#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This bot will search for references which are only made of a link
without title, (i.e. <ref>[http://www.google.fr/]</ref> or
<ref>http://www.google.fr/</ref>) and will fetch the html title from
the link to use it as the title of the wiki link in the reference, i.e.
<ref>[http://www.google.fr/search?q=test test - Google Search]</ref>
The bot checks every 20 edits a special stop page : if
the page has been edited, it stops.
DumZiBoT is running that script on en: & fr: at every new dump, running it on de: is not allowed anymore.
As it uses it, you need to configure noreferences.py for your wiki, or it will not work.
pdfinfo is needed for parsing pdf titles.
See [[:en:User:DumZiBoT/refLinks]] for more information on the bot.
&params;
-limit:n Stops after n edits
-xml:dump.xml Should be used instead of a simple page fetching
method from pagegenerators.py for performance and
load issues
-xmlstart Page to start with when using an XML dump
-ignorepdf Do not handle PDF files (handy if you use Windows and
can't get pdfinfo)
Basic pagegenerators commands, -page, etc...
"""
# (C) 2008 - Nicolas Dumazet ( en:User:NicDumZ )
#
# Distributed under the terms of the GPL
__version__ = '$Id$'
from BeautifulSoup import UnicodeDammit
import sys, re, urllib2, httplib, socket, codecs, ftplib
import wikipedia, pagegenerators, noreferences
import subprocess, tempfile, os
import gzip, StringIO
# Title of the "stop page" on each wiki, keyed by language code.
# ReferencesRobot resolves the entry for the current site and remembers the
# page's latest revision id.
stopPage = {
    'fr': u'Utilisateur:DumZiBoT/EditezCettePagePourMeStopper',
    'de': u'Benutzer:DumZiBoT/EditThisPageToStopMe',
    'it': u'Utente:Màrço27Bot/EditThisPageToStopMe',
    'ko': u'사용자:GrassnBreadRefBot/EditThisPageToStopMe1',
    'hu': 'User:Damibot/EditThisPageToStopMe',
    'en': u'User:DumZiBoT/EditThisPageToStopMe',
    'zh': u'User:Sz-iwbot',
}
# Edit summary used when saving a page, keyed by language code
# (passed to wikipedia.setAction() at the start of a run).
msg = {
    'fr': u'Bot: Correction des refs. mal formatées, suppression doublons en utilisant des références nommées (cf. [[Utilisateur:DumZiBoT/liensRefs|explications]])',
    'de': u'Bot: Korrektes Referenzformat (siehe [[:en:User:DumZiBoT/refLinks]])',
    'hu': u'Robot: Forráshivatkozások kibővítése a hivatkozott oldal címével',
    'ko': u'봇: url만 있는 주석을 보강, (영문)[[:en:User:DumZiBoT/refLinks]] 참조',
    'es': u'Formateando las referencias que no tuvieran títulos (FAQ : [[:en:User:DumZiBoT/refLinks]] )',
    'it': u'Bot: Sistemo note con collegamenti esterni senza titolo ([[Utente:Màrço27Bot/refLinks.py|documentazione]])',
    'en': u'Bot: Converting bare references, using ref names to avoid duplicates, see [[User:DumZiBoT/refLinks|FAQ]]',
}
# Wikitext written in place of a dead link, keyed by language code.
# '%s' receives the dead URL.
# NOTE(review): the 'de' entry is empty, so formatting it with a URL raises
# TypeError ("not all arguments converted") -- confirm what German wikis
# are supposed to get here.
deadLinkTag = {
    'fr': u'[%s] {{lien mort}}',
    'de': u'',
    'hu': u'[%s] {{halott link}}',
    'ko': u'[%s] {{죽은 바깥 고리}}',
    'es': u'{{enlace roto2|%s}}',
    'it': u'{{Collegamento interrotto|%s}}',
    'en': u'[%s] {{dead link}}',
}
# Text stored on each RefLink as ``linkComment`` to flag a title as
# bot-generated, keyed by language code.
comment = {
    'fr': u'Titre généré automatiquement',
    'de': u'Automatisch generierter titel',
    'hu': u'Robot generálta cím',
    'ko': u'봇이 따온 제목',
    'es': u'Título generado por un bot',
    'en': u'Bot generated title',
    'it': u'Titolo generato automaticamente',
    'ar': u'عنوان مولد بالبوت',
}
# Heuristic for URLs that look like an error page ("soft 404"); run() applies
# it to the URL reached after HTTP redirects.
soft404 = re.compile(r'\D404(\D|\Z)|error|errdoc|Not.{0,3}Found|sitedown|eventlog', re.IGNORECASE)
# matches an URL at the index of a website
dirIndex = re.compile(r'^\w+://[^/]+/((default|index)\.(asp|aspx|cgi|htm|html|phtml|mpx|mspx|php|shtml|var))?$', re.IGNORECASE)
# Extracts the (scheme, domain name) pair, ignoring a leading "www.".
# The dot is escaped so that only a literal "www." prefix is dropped; an
# unescaped dot would also strip the first four characters of hosts such as
# "wwwx.example.com".
domain = re.compile(r'^(\w+)://(?:www\.|)([^/]+)')
# Titles that must never be used as a link title, whatever the language.
# The pattern is compiled in VERBOSE mode (re.X): whitespace outside a
# character class is ignored and '#' starts a comment, so a literal space
# has to be written as '[ ]'.
globalbadtitles = r"""
# is
(test|
# starts with
^\W*(
register
|registration
|(sign|log)[ \-]?in
|subscribe
|sign[ \-]?up
|log[ \-]?on
|untitled[ ]*(document|page|$)
).*
# anywhere
|.*(404|page|file).*not([ ]*be)?[ ]*found.*
# ends with
|.*(
register
|registration
|(sign|log)[ \-]?in
|subscribe|sign[ \-]?up
|log[ \-]?on
)\W*$
)
"""
# Language-specific bad titles
# These patterns are OR-ed with globalbadtitles and compiled in VERBOSE mode
# (re.X), where a bare space in the pattern is ignored: every literal space
# therefore has to be spelled '[ ]'.
badtitles = {
    'en': '',
    'fr': '.*(404|page|site).*en[ ]+travaux.*',
    'es': '.*sitio.*no[ ]+disponible.*',
    'it': '((pagina|sito)[ ](non[ ]trovata|inesistente)|accedi)',
}
# Prefix of the names given to references that have none, keyed by language
# code; DuplicateReferences appends a number to it.
autogen = {
    'en': 'autogenerated',
    'it': 'autogenerato',
}
# Regex that matches bare references: a <ref ...> element whose only content
# is a URL, optionally wrapped in single brackets.
#   group 'name' : everything between '<ref' and '>' (the ref attributes)
#   group 'url'  : the http/https/ftp URL
# NOTE(review): the '<ref', '</ref>' and named-group markup was restored
# after it had been stripped from this literal -- confirm against upstream.
linksInRef = re.compile(
    # bracketed URLs
    r'(?i)<ref(?P<name>[^>]*)>\s*\[?(?P<url>(?:http|https|ftp)://(?:' +
    # unbracketed with ()
    r'[^\[\]\s<>"]+\([^\[\]\s<>"]+[^\[\]\s\.:;\\,<>\?"]+|' +
    # unbracketed without ()
    r'[^\[\]\s<>"]+[^\[\]\s\)\.:;\\,<>\?"]+|[^\[\]\s<>"]+))[!?,\s]*\]?\s*</ref>')
# File name of the list of known dead links ( maintained by User:Dispenser ).
# Download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and
# ungzip it; run() opens the file by this relative name, decodes it as
# latin_1 and looks for tab-delimited URLs in it before tagging a 404 link
# as dead.
listof404pages = '404-links.txt'
class XmlDumpPageGenerator:
    """Iterator over the pages of an XML dump that contain bare references.

    - xmlFilename : path of the dump, handed to xmlreader.XmlDump
    - xmlStart    : title of the first entry to consider; every entry before
                    it is skipped (a false value disables skipping)
    - namespaces  : list of accepted namespaces; an empty list accepts all
    """

    def __init__(self, xmlFilename, xmlStart, namespaces):
        self.xmlStart = xmlStart
        self.namespaces = namespaces
        # Stay in "skipping" mode until the xmlStart title has been seen
        self.skipping = bool(xmlStart)
        self.site = wikipedia.getSite()
        import xmlreader
        self.parser = xmlreader.XmlDump(xmlFilename).parse()

    def __iter__(self):
        return self

    def next(self):
        """Return the next page whose dump text matches linksInRef.

        StopIteration raised by the dump parser is left to propagate and
        ends the iteration.
        """
        while True:
            entry = self.parser.next()
            if self.skipping:
                if entry.title != self.xmlStart:
                    continue
                self.skipping = False
            page = wikipedia.Page(self.site, entry.title)
            if self.namespaces != [] and page.namespace() not in self.namespaces:
                continue
            if linksInRef.search(entry.text):
                return page
class RefLink:
    """Container to handle a single bare reference.

    Attributes:
      refname     : attribute text found between '<ref' and '>' (may be empty)
      link        : the URL exactly as written in the article
      url         : the link without its '#fragment'
      title       : title to use for the link, None until one has been found
      linkComment : localized "bot generated title" remark

    NOTE(review): the '<ref>' markup and the HTML entities used below were
    restored after having been stripped/decoded from the string literals --
    confirm against upstream.
    """

    def __init__(self, link, name):
        self.refname = name
        self.link = link
        self.site = wikipedia.getSite()
        self.linkComment = wikipedia.translate(self.site, comment)
        self.url = re.sub(u'#.*', '', self.link)
        self.title = None

    def refTitle(self):
        """Returns the <ref> with its new title"""
        return '<ref%s>[%s %s<!-- %s -->]</ref>' % (self.refname, self.link,
                                                    self.title,
                                                    self.linkComment)

    def refLink(self):
        """No title has been found, return the unbracketed link"""
        return '<ref%s>%s</ref>' % (self.refname, self.link)

    def refDead(self):
        """Dead link, tag it with a {{dead link}}"""
        template = wikipedia.translate(self.site, deadLinkTag)
        if '%s' not in template:
            # Some wikis define an empty tag: formatting it with the link
            # would raise TypeError, so leave the bare link untouched.
            return self.refLink()
        return '<ref%s>%s</ref>' % (self.refname, template % self.link)

    def transform(self, ispdf = False):
        """Normalize the title so that it can be used inside a wiki link.

        - ispdf : when True, HTML entities in the title are not decoded
        """
        #convert html entities
        if not ispdf:
            self.title = wikipedia.html2unicode(self.title)
        self.title = re.sub(r'-+', '-', self.title)
        #remove formatting, i.e long useless strings
        self.title = re.sub(r'[\.+\-=]{4,}', ' ', self.title)
        #remove \n and \r and Unicode spaces from titles
        self.title = re.sub(r'(?u)\s', ' ', self.title)
        self.title = re.sub(r'[\n\r\t]', ' ', self.title)
        #remove extra whitespaces
        #remove leading and trailing ./;/,/-/_/+/ /
        self.title = re.sub(r' +', ' ', self.title.strip(r'=.;,-+_ '))

        self.avoid_uppercase()
        #avoid closing the link before the end
        self.title = self.title.replace(']', '&#93;')
        #avoid multiple } being interpreted as a template inclusion
        self.title = self.title.replace('}}', '}&#125;')
        #prevent multiple quotes being interpreted as '' or '''
        self.title = self.title.replace('\'\'', '\'&#39;')
        self.title = wikipedia.unicode2html(self.title, self.site.encoding())
        # TODO : remove HTML when both opening and closing tags are included

    def avoid_uppercase(self):
        """
        Convert a mostly-uppercase title to Title Case.

        Titles of 6 characters or less, and titles containing any digit,
        are left untouched. Otherwise, if uppercase characters make up
        more than 70% of (number of letters + 1), title() is applied.
        """
        if len(self.title) <= 6:
            return
        if any(char.isdigit() for char in self.title):
            return
        nb_upper = sum(1 for char in self.title if char.isupper())
        nb_letter = sum(1 for char in self.title if char.isalpha())
        # the "+ 1" keeps the ratio defined for titles without any letter
        if float(nb_upper) / (nb_letter + 1) > .70:
            self.title = self.title.title()
class DuplicateReferences:
    """
    When some references are duplicated in an article,
    name the first, and remove the content of the others

    NOTE(review): the '<ref>' markup and the named groups of the patterns
    below were restored after having been stripped from the string
    literals -- confirm against upstream.
    """

    # Match references: <ref params>content</ref> (self-closing refs excluded)
    REFS = re.compile(r'(?i)<ref(?P<params>[^>/]*)>(?P<content>.*?)</ref>')
    # Template for "attribute = value" inside the ref parameters. The value
    # stops at the closing quote (quoted form) or at the first whitespace
    # (unquoted form), so that a neighbouring attribute is never swallowed.
    _ATTRIBUTE = (r'(?i).*\b%s\s*=\s*(?P<quote>"?)\s*'
                  r'(?P<%s>[^"\s]+(?:\s+[^"\s]+)*?)\s*(?P=quote).*')
    NAMES = re.compile(_ATTRIBUTE % ('name', 'name'))
    GROUPS = re.compile(_ATTRIBUTE % ('group', 'group'))

    def __init__(self):
        self.autogen = wikipedia.translate(wikipedia.getSite(), autogen)

    def _next_free_id(self, used_names, start):
        """Return the first number >= start whose autogen name is unused."""
        number = start
        while (self.autogen + str(number)) in used_names:
            number += 1
        return number

    def process(self, text):
        """Return text with duplicated references merged.

        Within a ref group, references sharing the same content are
        collapsed: the first occurrence gets a name (its own, or an
        auto-generated one) and every later occurrence is replaced by an
        empty '<ref name=... />'. Empty references that used another name
        for the same content are renamed accordingly.
        """
        # keys are ref group names (None for the default group)
        # values are a dict where :
        #   keys are ref content
        #   values are [name, [list of full ref matches], quoted, need_to_change]
        found_refs = {}
        found_names = {}
        # Replace key by [value, quoted]
        renamed = {}

        for match in self.REFS.finditer(text):
            content = match.group('content')
            if not content.strip():
                continue

            params = match.group('params')
            group_match = self.GROUPS.match(params)
            # Key on the group *name*: match objects are all distinct, so
            # keying on them would never bring two references together.
            group = None
            if group_match:
                group = group_match.group('group')
            groupdict = found_refs.setdefault(group, {})

            if content in groupdict:
                v = groupdict[content]
                v[1].append(match.group())
            else:
                v = [None, [match.group()], False, False]

            name_match = self.NAMES.match(params)
            if name_match:
                quoted = name_match.group('quote') == '"'
                name = name_match.group('name')
                if v[0]:
                    if v[0] != name:
                        renamed[name] = [v[0], v[2]]
                elif name not in found_names:
                    # first time ever we meet this name
                    v[2] = quoted
                    v[0] = name
                else:
                    # this name is already used with another content.
                    # We'll need to change it
                    v[3] = True
                found_names[name] = 1
            groupdict[content] = v

        number = self._next_free_id(found_names, 1)
        for (g, d) in found_refs.items():
            if g:
                group = u'group="%s" ' % g
            else:
                group = u''

            for (k, v) in d.items():
                if len(v[1]) == 1 and not v[3]:
                    continue
                name = v[0]
                if not name:
                    name = self.autogen + str(number)
                    # skip numbers already taken by names found in the text
                    number = self._next_free_id(found_names, number + 1)
                elif v[2]:
                    name = u'"%s"' % name
                named = u'<ref %sname=%s>%s</ref>' % (group, name, k)
                text = text.replace(v[1][0], named, 1)

                # make sure that the first (named ref) is not
                # removed later :
                pos = text.index(named) + len(named)
                header = text[:pos]
                end = text[pos:]

                unnamed = u'<ref %sname=%s />' % (group, name)
                for ref in v[1][1:]:
                    end = end.replace(ref, unnamed)
                text = header + end

        for (old_name, v) in renamed.items():
            # TODO : Support ref groups
            name = v[0]
            if v[1]:
                name = u'"%s"' % name
            # The old name is data, not a pattern: escape it, and pass the
            # replacement through a function so that backslashes in the new
            # name are not interpreted either.
            pattern = (r'<ref name\s*=\s*(?P<quote>"?)\s*%s\s*(?P=quote)\s*/>'
                       % re.escape(old_name))
            replacement = u'<ref name=%s />' % name
            text = re.sub(pattern, lambda m: replacement, text)
        return text
class ReferencesRobot:
def __init__(self, generator, acceptall = False, limit = None, ignorepdf = False ):
    """
    - generator : Page generator
    - acceptall : boolean, is -always on ?
    - limit : int, stop after n modified pages
    - ignorepdf : boolean

    Raises wikipedia.NoPage when the stop page does not exist on the wiki.
    """
    self.generator = generator
    self.acceptall = acceptall
    self.limit = limit
    self.ignorepdf = ignorepdf
    self.site = wikipedia.getSite()
    self.stopPage = wikipedia.Page(self.site, wikipedia.translate(self.site, stopPage))

    # Blacklist of titles: the global patterns, plus the local ones if any
    local = wikipedia.translate(self.site, badtitles)
    if local:
        bad = '(' + globalbadtitles + '|' + local + ')'
    else:
        bad = globalbadtitles
    self.titleBlackList = re.compile(bad, re.I | re.S | re.X)
    self.norefbot = noreferences.NoReferencesBot(None)
    self.deduplicator = DuplicateReferences()
    try:
        # Remember the current revision of the stop page
        self.stopPageRevId = self.stopPage.latestRevision()
    except wikipedia.NoPage:
        wikipedia.output(u'The stop page %s does not exist'
                         % self.stopPage.aslink())
        raise

    # NOTE(review): the HTML tags and the named group of the four patterns
    # below were restored after having been stripped from the literals; the
    # NON_HTML assignment had been swallowed into its comment line. Confirm
    # the attribute name NON_HTML and the group name 'enc' against the code
    # that uses them.
    # Regex to grasp content-type meta HTML tag in HTML source
    self.META_CONTENT = re.compile(r'(?i)<meta[^>]*content\-type[^>]*>')
    # Extract the encoding from a charset property (from content-type !)
    self.CHARSET = re.compile(r'(?i)charset\s*=\s*(?P<enc>[^\'";>/]*)')
    # Extract html title from page
    self.TITLE = re.compile(r'(?is)(?<=<title>).*?(?=</title>)')
    # Matches content inside <script>/<style>/HTML comments/CDATA sections
    self.NON_HTML = re.compile(r'(?is)<script[^>]*>.*?</script>|<style[^>]*>.*?</style>|<!--.*?-->|<!\[CDATA\[.*?\]\]>')
    # Authorized mime types for HTML pages
    self.MIME = re.compile(r'application/(?:xhtml\+xml|xml)|text/(?:ht|x)ml')
def put_page(self, page, new):
    """
    Show the diff between the current text of page and new, then save new.

    Unless acceptall is set, the user is asked first: 'y' queues the edit
    with put_async(), 'a' turns acceptall on (the page is then saved with
    put() below), anything else leaves the page unchanged.
    """
    wikipedia.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                     % page.title())
    wikipedia.showDiff(page.get(), new)

    if not self.acceptall:
        answer = wikipedia.inputChoice(u'Do you want to accept these changes?',
                                       ['Yes', 'No', 'All'],
                                       ['y', 'N', 'a'], 'N')
        if answer == 'a':
            self.acceptall = True
        elif answer == 'y':
            page.put_async(new)

    if not self.acceptall:
        return

    # Synchronous save; known failures are reported and the page is skipped
    try:
        page.put(new)
    except wikipedia.EditConflict:
        wikipedia.output(u'Skipping %s because of edit conflict'
                         % (page.title(),))
    except wikipedia.SpamfilterError as e:
        wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), e.url))
    except wikipedia.PageNotSaved as error:
        wikipedia.output(u'Error putting page: %s' % (error.args,))
    except wikipedia.LockedPage:
        wikipedia.output(u'Skipping %s (locked page)'
                         % (page.title(),))
    except wikipedia.ServerError as e:
        wikipedia.output(u'Server Error : %s' % e)
def httpError(self, err_num, link, pagetitleaslink):
    """Report an HTTP error met while fetching link, found on the given page."""
    message = u'HTTP error (%s) for %s on %s' % (err_num, link, pagetitleaslink)
    wikipedia.output(message, toStdout = True)
def getPDFTitle(self, ref, f):
    """
    Use pdfinfo to retrieve title from a PDF.

    - ref : RefLink whose title attribute is set when a title is found
    - f   : file-like object from which the PDF content is read

    The PDF is copied to a temporary file which is always removed.
    Errors raised while reading f propagate to the caller; errors raised
    while running pdfinfo or parsing its output are only logged.
    Requires the external pdfinfo program.
    """
    wikipedia.output( u'PDF file.' )
    fd, infile = tempfile.mkstemp()
    try:
        urlobj = os.fdopen(fd, 'wb')
        try:
            urlobj.write(f.read())
        finally:
            # Close before running pdfinfo so that every byte is on disk
            urlobj.close()
        try:
            # Give pdfinfo the path of the temporary file rather than a
            # file descriptor on stdin
            pdfinfo_out = subprocess.Popen(["pdfinfo", infile],
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE,
                                           shell=False).communicate()[0]
            for aline in pdfinfo_out.splitlines():
                if aline.lower().startswith('title'):
                    # drop the leading "Title:" word, keep the rest
                    ref.title = ' '.join(aline.split(None)[1:])
                    if ref.title != '':
                        wikipedia.output(u'title: ' + ref.title)
            wikipedia.output( u'PDF done.' )
        except ValueError:
            wikipedia.output( u'pdfinfo value error.' )
        except OSError:
            wikipedia.output( u'pdfinfo OS error.' )
        except Exception:
            # Best effort: a PDF without usable title is not fatal
            wikipedia.output( u'PDF processing error.' )
    finally:
        os.unlink(infile)
def run(self):
"""
Runs the Bot
"""
wikipedia.setAction(wikipedia.translate(self.site, msg))
try:
deadLinks = codecs.open(listof404pages, 'r', 'latin_1').read()
except IOError:
wikipedia.output('You need to download http://www.twoevils.org/files/wikipedia/404-links.txt.gz and to ungzip it in the same directory')
raise
socket.setdefaulttimeout(30)
editedpages = 0
for page in self.generator:
try:
# Load the page's text from the wiki
new_text = page.get()
if not page.canBeEdited():
wikipedia.output(u"You can't edit page %s"
% page.aslink())
continue
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found' % page.aslink())
continue
except wikipedia.IsRedirectPage:
wikipedia.output(u'Page %s is a redirect' % page.aslink())
continue
for match in linksInRef.finditer(wikipedia.removeDisabledParts(page.get())):
#for each link to change
link = match.group(u'url')
#debugging purpose
#print link
if u'jstor.org' in link:
#TODO: Clean URL blacklist
continue
ref = RefLink(link, match.group('name'))
f = None
try:
socket.setdefaulttimeout(20)
req = urllib2.Request(ref.url)
req.add_header('User-agent', 'reflinks.py (+http://toolserver.org/~dispenser/view/Reflinks)')
req.add_header('Accept-Encoding', 'gzip')
f = urllib2.urlopen(req)
#Try to get Content-Type from server
headers = f.info()
contentType = headers.getheader('Content-Type')
if contentType and not self.MIME.search(contentType):
if ref.link.lower().endswith('.pdf') and not self.ignorepdf:
# If file has a PDF suffix
self.getPDFTitle(ref, f)
else:
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : media : %s ' % ref.link)
if ref.title:
if not re.match('(?i) *microsoft (word|excel|visio)', ref.title):
ref.transform(ispdf=True)
repl = ref.refTitle()
else:
wikipedia.output('\03{lightyellow}WARNING\03{default} : PDF title blacklisted : %s ' % ref.title)
repl = ref.refLink()
else:
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
# Get the real url where we end (http redirects !)
redir = f.geturl()
if redir != ref.link and domain.findall(redir) == domain.findall(link):
if soft404.search(redir) and not soft404.search(ref.link):
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect 404 : %s ' % ref.link)
continue
if dirIndex.match(redir) and not dirIndex.match(ref.link):
wikipedia.output(u'\03{lightyellow}WARNING\03{default} : Redirect to root : %s ' % ref.link)
continue
# Read the first 1,000,000 bytes (0.95 MB)
if headers.get('Content-Encoding') in ('gzip', 'x-gzip'):
linkedpagetext= gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read(1000000)
else:
linkedpagetext = f.read(1000000)
socket.setdefaulttimeout(None)
except UnicodeError:
#example : http://www.adminet.com/jo/20010615¦/ECOC0100037D.html in [[fr:Cyanure]]
wikipedia.output(u'\03{lightred}Bad link\03{default} : %s in %s' % (ref.url, page.aslink()))
continue
except urllib2.HTTPError, e:
wikipedia.output(u'HTTP error (%s) for %s on %s'
% (e.code, ref.url, page.aslink()),
toStdout = True)
# 410 Gone, indicates that the resource has been purposely removed
if e.code == 410 or (e.code == 404 and (u'\t%s\t' % ref.url in deadLinks)):
repl = ref.refDead()
new_text = new_text.replace(match.group(), repl)
continue
except (urllib2.URLError,
socket.error,
IOError,
httplib.error), e:
#except (urllib2.URLError, socket.timeout, ftplib.error, httplib.error, socket.error), e:
wikipedia.output(u'Can\'t retrieve page %s : %s' % (ref.url, e))
continue
except ValueError:
#Known bug of httplib, google for :
#"httplib raises ValueError reading chunked content"
continue
finally:
if f:
f.close()
#remove