#!/usr/bin/env python # -*- coding: utf-8 -*- """ This bot will search for references which are only made of a link without title, (i.e. [http://www.google.fr/] or http://www.google.fr/) and will fetch the html title from the link to use it as the title of the wiki link in the reference, i.e. [http://www.google.fr/search?q=test test - Google Search] The bot checks every 20 edits a special stop page : if the page has been edited, it stops. DumZiBoT is running that script on en: & fr: at every new dump, running it on de: is not allowed anymore. As it uses it, you need to configure noreferences.py for your wiki, or it will not work. pdfinfo is needed for parsing pdf titles. See [[:en:User:DumZiBoT/refLinks]] for more information on the bot. ¶ms; -limit:n Stops after n edits -xml:dump.xml Should be used instead of a simple page fetching method from pagegenerators.py for performance and load issues -xmlstart Page to start with when using an XML dump -ignorepdf Do not handle PDF files (handy if you use Windows and can't get pdfinfo) Basic pagegenerators commands, -page, etc... """ # (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ ) # # Distributed under the terms of the GPL __version__ = '$Id$' from BeautifulSoup import UnicodeDammit import sys, re, urllib2, httplib, socket, codecs, ftplib import wikipedia, pagegenerators, noreferences import subprocess, tempfile, os import gzip, StringIO stopPage = {'fr':u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper', 'de':u'Benutzer:DumZiBoT/EditThisPageToStopMe', 'it':u'Utente:Màrço27Bot/EditThisPageToStopMe', 'ko':u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1', 'hu':'User:Damibot/EditThisPageToStopMe', 'en':u'User:DumZiBoT/EditThisPageToStopMe', 'zh':u'User:Sz-iwbot', } msg = { 'fr':u'Bot: Correction des refs. mal formatées, suppression doublons en utilisant des références nommées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])', 'de':u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])', 'hu':u'Robot: Forráshivatkozások kibővítése a hivatkozott oldal címével', 'ko':u'봇: url만 있는 주석을 보강, (영문)[[:en:User:DumZiBoT/refLinks]] 참조', 'es':u'Formateando las referencias que no tuvieran títulos (FAQ : [[:en:User:DumZiBoT/refLinks]] )', 'it':u'Bot: Sistemo note con collegamenti esterni senza titolo ([[Utente:Màrço27Bot/refLinks.py|documentazione]])', 'en':u'Bot: Converting bare references, using ref names to avoid duplicates, see [[User:DumZiBoT/refLinks|FAQ]]', } deadLinkTag = {'fr':u'[%s] {{lien mort}}', 'de':u'', 'hu':u'[%s] {{halott link}}', 'ko':u'[%s] {{죽은 바깥 고리}}', 'es':u'{{enlace roto2|%s}}', 'it':u'{{Collegamento interrotto|%s}}', 'en':u'[%s] {{dead link}}'} comment = {'fr':u'Titre généré automatiquement', 'de':u'Automatisch generierter titel', 'hu':u'Robot generálta cím', 'ko':u'봇이 따온 제목', 'es':u'Título generado por un bot', 'en':u'Bot generated title', 'it':u'Titolo generato automaticamente', 'ar':u'عنوان مولد بالبوت'} soft404 = re.compile(ur'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE) # matches an URL at the index of a website dirIndex = re.compile(ur'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE) # Extracts the domain name domain = re.compile(ur'^(\w+)://(?:www.|)([^/]+)') globalbadtitles = """ # is (test| # starts with ^\W*( register |registration |(sign|log)[ \-]?in |subscribe |sign[ \-]?up |log[ \-]?on |untitled *(document|page|$) ).* # anywhere |.*(404|page|file).*not([ ]*be)?[ ]*found.* # ends with |.*( register |registration |(sign|log)[ \-]?in |subscribe|sign[ \-]?up |log[ \-]?on )\W*$ ) """ # Language-specific bad titles badtitles = { 'en': '', 'fr': '.*(404|page|site).*en +travaux.*', 'es': '.*sitio.*no +disponible.*', 'it': '((pagina|sito) (non trovata|inesistente)|accedi)' } autogen = { 'en': 'autogenerated', 'it': 'autogenerato' } # Regex that match bare references linksInRef = re.compile( # bracketed URLs ur'(?i)[^>]*)>\s*\[?(?P(?:http|https|ftp)://(?:' + # unbracketed with() ur'^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|'+ # unbracketed without () ur'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*') # Download this file : # http://www.twoevils.org/files/wikipedia/404-links.txt.gz # ( maintained by User:Dispenser ) listof404pages = '404-links.txt' class XmlDumpPageGenerator: """Xml generator that yiels pages containing bare references""" def __init__(self, xmlFilename, xmlStart, namespaces): self.xmlStart = xmlStart self.namespaces = namespaces self.skipping = bool(xmlStart) self.site = wikipedia.getSite() import xmlreader dump = xmlreader.XmlDump(xmlFilename) self.parser = dump.parse() def __iter__(self): return self def next(self): while True: try: entry = self.parser.next() except StopIteration: raise if self.skipping: if entry.title != self.xmlStart: continue self.skipping = False page=wikipedia.Page(self.site, entry.title) if not self.namespaces == []: if page.namespace() not in self.namespaces: continue if linksInRef.search(entry.text): return page class RefLink: """Container to handle a single bare reference""" def __init__(self, link, name): self.refname = name self.link = link self.site = wikipedia.getSite() self.linkComment = wikipedia.translate(self.site, comment) self.url = re.sub(u'#.*', '', self.link) self.title = None def refTitle(self): """Returns the with its new title""" return '[%s %s]' % (self.refname, self.link, self.title, self.linkComment) def refLink(self): """No title has been found, return the unbracketed link""" return '%s' % (self.refname, self.link) def refDead(self): """Dead link, tag it with a {{dead link}}""" tag = wikipedia.translate(self.site, deadLinkTag) % self.link return '%s' % (self.refname, tag) def transform(self, ispdf = False): """Normalize the title""" #convert html entities if not ispdf: self.title = wikipedia.html2unicode(self.title) self.title = re.sub(r'-+', '-', self.title) #remove formatting, i.e long useless strings self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title) #remove \n and \r and Unicode spaces from titles self.title = re.sub(r'(?u)\s', ' ', self.title) self.title = re.sub(r'[\n\r\t]', ' ', self.title) #remove extra whitespaces #remove leading and trailing ./;/,/-/_/+/ / self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ ')) self.avoid_uppercase() #avoid closing the link before the end self.title = self.title.replace(']', ']') #avoid multiple } being interpreted as a template inclusion self.title = self.title.replace('}}', '}}') #prevent multiple quotes being interpreted as '' or ''' self.title = self.title.replace('\'\'', '\''') self.title = wikipedia.unicode2html(self.title, self.site.encoding()) # TODO : remove HTML when both opening and closing tags are included def avoid_uppercase(self): """ If title has more than 6 characters and has 60% of uppercase characters, capitalize() it """ if len(self.title) <= 6: return nb_upper = 0 nb_letter = 0 for letter in self.title: if letter.isupper(): nb_upper += 1 if letter.isalpha(): nb_letter += 1 if letter.isdigit(): return if float(nb_upper)/(nb_letter+1) > .70: self.title = self.title.title() class DuplicateReferences: """ When some references are duplicated in an article, name the first, and remove the content of the others """ def __init__(self): # Match references self.REFS = re.compile(u'(?i)[^>/]*)>(?P.*?)') self.NAMES = re.compile(u'(?i).*name\s*=\s*(?P"?)\s*(?P.+)\s*(?P=quote).*') self.GROUPS = re.compile(u'(?i).*group\s*=\s*(?P"?)\s*(?P.+)\s*(?P=quote).*') self.autogen = wikipedia.translate(wikipedia.getSite(), autogen) def process(self, text): # keys are ref groups # values are a dict where : # keys are ref content # values are [name, [list of full ref matches], quoted, need_to_change] foundRefs = {} foundRefNames = {} # Replace key by [value, quoted] namedRepl = {} for match in self.REFS.finditer(text): content = match.group('content') if not content.strip(): continue params = match.group('params') group = self.GROUPS.match(params) if not foundRefs.has_key(group): foundRefs[group] = {} groupdict = foundRefs[group] if groupdict.has_key(content): v = groupdict[content] v[1].append(match.group()) else: v = [None, [match.group()], False, False] name = self.NAMES.match(params) if name: quoted = name.group('quote') == '"' name = name.group('name') if v[0]: if v[0] != name: namedRepl[name] = [v[0], v[2]] else: #First name associated with this content if name == 'population': wikipedia.output(content) if not foundRefNames.has_key(name): # first time ever we meet this name if name == 'population': print "in" v[2] = quoted v[0] = name else: # if has_key, means that this name is used # with another content. We'll need to change it v[3] = True foundRefNames[name] = 1 groupdict[content] = v id = 1 while foundRefNames.has_key(self.autogen + str(id)): id += 1 for (g, d) in foundRefs.iteritems(): if g: group = "group=\"%s\" " % group else: group = "" for (k, v) in d.iteritems(): if len(v[1]) == 1 and not v[3]: continue name = v[0] if not name: name = self.autogen + str(id) id += 1 elif v[2]: name = u'"%s"' % name named = u'%s' % (group, name, k) text = text.replace(v[1][0], named, 1) # make sure that the first (named ref) is not # removed later : pos = text.index(named) + len(named) header = text[:pos] end = text[pos:] unnamed = u'' % (group, name) for ref in v[1][1:]: end = end.replace(ref, unnamed) text = header + end for (k,v) in namedRepl.iteritems(): # TODO : Support ref groups name = v[0] if v[1]: name = u'"%s"' % name text = re.sub(u'"?)\s*%s\s*(?P=quote)\s*/>' % k, u'' % name, text) return text class ReferencesRobot: def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ): """ - generator : Page generator - acceptall : boolean, is -always on ? - limit : int, stop after n modified pages - ignorepdf : boolean """ self.generator = generator self.acceptall = acceptall self.limit = limit self.ignorepdf = ignorepdf self.site = wikipedia.getSite() self.stopPage = wikipedia.Page(self.site, wikipedia.translate(self.site, stopPage)) local = wikipedia.translate(self.site, badtitles) if local: bad = '(' + globalbadtitles + '|' + local + ')' else: bad = globalbadtitles self.titleBlackList = re.compile(bad, re.I | re.S | re.X) self.norefbot = noreferences.NoReferencesBot(None) self.deduplicator = DuplicateReferences() try : self.stopPageRevId = self.stopPage.latestRevision() except wikipedia.NoPage : wikipedia.output(u'The stop page %s does not exist' % self.stopPage.aslink()) raise # Regex to grasp content-type meta HTML tag in HTML source self.META_CONTENT = re.compile(ur'(?i)]*content\-type[^>]*>') # Extract the encoding from a charset property (from content-type !) self.CHARSET = re.compile(ur'(?i)charset\s*=\s*(?P[^\'";>/]*)') # Extract html title from page self.TITLE = re.compile(ur'(?is)(?<=).*?(?=)') # Matches content inside |]*>.*?||') # Authorized mime types for HTML pages self.MIME = re.compile(ur'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml') def put_page(self, page, new): """ Prints diffs between orginal and new (text), puts new text for page """ wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<" % page.title()) wikipedia.showDiff(page.get(), new) if not self.acceptall: choice = wikipedia.inputChoice(u'Do you want to accept ' + u'these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N') if choice == 'a': self.acceptall = True if choice == 'y': page.put_async(new) if self.acceptall: try: page.put(new) except wikipedia.EditConflict: wikipedia.output(u'Skipping %s because of edit conflict' % (page.title(),)) except wikipedia.SpamfilterError, e: wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url)) except wikipedia.PageNotSaved, error: wikipedia.output(u'Error putting page: %s' % (error.args,)) except wikipedia.LockedPage: wikipedia.output(u'Skipping %s (locked page)' % (page.title(),)) except wikipedia.ServerError, e: wikipedia.output(u'Server Error : %s' % e) def httpError(self, err_num, link, pagetitleaslink): """Log HTTP Error""" wikipedia.output(u'HTTP error (%s) for %s on %s' % (err_num, link, pagetitleaslink), toStdout = True) def getPDFTitle(self, ref, f): """ Use pdfinfo to retrieve title from a PDF. Unix-only, I'm afraid. """ wikipedia.output( u'PDF file.' ) fd, infile = tempfile.mkstemp() urlobj = os.fdopen(fd, 'r+w') urlobj.write(f.read()) try: pdfinfo_out = subprocess.Popen([r"pdfinfo","/dev/stdin"], stdin=urlobj, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False).communicate()[0] for aline in pdfinfo_out.splitlines(): if aline.lower().startswith('title'): ref.title = aline.split(None)[1:] ref.title = ' '.join(ref.title) if ref.title != '': wikipedia.output(u'title: ' +ref.title ) wikipedia.output( u'PDF done.' ) except ValueError: wikipedia.output( u'pdfinfo value error.' ) except OSError: wikipedia.output( u'pdfinfo OS error.' ) except: # Ignore errors wikipedia.output( u'PDF processing error.' ) pass finally: urlobj.close() os.unlink(infile) def run(self): """ Runs the Bot """ wikipedia.setAction(wikipedia.translate(self.site, msg)) try: deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read() except IOError: wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory') raise socket.setdefaulttimeout(30) editedpages = 0 for page in self.generator: try: # Load the page's text from the wiki new_text = page.get() if not page.canBeEdited(): wikipedia.output(u"You can't edit page %s" % page.aslink()) continue except wikipedia.NoPage: wikipedia.output(u'Page %s not found' % page.aslink()) continue except wikipedia.IsRedirectPage: wikipedia.output(u'Page %s is a redirect' % page.aslink()) continue for match in linksInRef.finditer(wikipedia.removeDisabledParts(page.get())): #for each link to change link = match.group(u'url') #debugging purpose #print link if u'jstor.org' in link: #TODO: Clean URL blacklist continue ref = RefLink(link, match.group('name')) f = None try: socket.setdefaulttimeout(20) req = urllib2.Request(ref.url) req.add_header('User-agent', 'reflinks.py (+http://toolserver.org/~dispenser/view/Reflinks)') req.add_header('Accept-Encoding', 'gzip') f = urllib2.urlopen(req) #Try to get Content-Type from server headers = f.info() contentType = headers.getheader('Content-Type') if contentType and not self.MIME.search(contentType): if ref.link.lower().endswith('.pdf') and not self.ignorepdf: # If file has a PDF suffix self.getPDFTitle(ref, f) else: wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link) if ref.title: if not re.match('(?i) *microsoft (word|excel|visio)', ref.title): ref.transform(ispdf=True) repl = ref.refTitle() else: wikipedia.output('\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s ' % ref.title) repl = ref.refLink() else: repl = ref.refLink() new_text = new_text.replace(match.group(), repl) continue # Get the real url where we end (http redirects !) redir = f.geturl() if redir != ref.link and domain.findall(redir) == domain.findall(link): if soft404.search(redir) and not soft404.search(ref.link): wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % ref.link) continue if dirIndex.match(redir) and not dirIndex.match(ref.link): wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link) continue # Read the first 1,000,000 bytes (0.95 MB) if headers.get('Content-Encoding') in ('gzip', 'x-gzip'): linkedpagetext= gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(1000000) else: linkedpagetext = f.read(1000000) socket.setdefaulttimeout(None) except UnicodeError: #example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html in [[fr:Cyanure]] wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.aslink())) continue except urllib2.HTTPError, e: wikipedia.output(u'HTTP error (%s) for %s on %s' % (e.code, ref.url, page.aslink()), toStdout = True) # 410 Gone, indicates that the resource has been purposely removed if e.code == 410 or (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)): repl = ref.refDead() new_text = new_text.replace(match.group(), repl) continue except (urllib2.URLError, socket.error, IOError, httplib.error), e: #except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e: wikipedia.output(u'Can\'t retrieve page %s : %s' % (ref.url, e)) continue except ValueError: #Known bug of httplib, google for : #"httplib raises ValueError reading chunked content" continue finally: if f: f.close() #remove