#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Please read [[WP:Disambiguation dos and don'ts]] before using this tool.
"""
# Developer scratch notes (TODO list plus wiki pages used as manual test
# cases) kept in a module-level string.  Nothing in the visible part of this
# file reads `text_test`.
text_test ="""
TODO
* Remove duplicate links (including if they redirect to the same place)
* detect self links (i.e. warn about unintended circular links)
* FIXME spaces removed when removing [[priamary]], [[a]] [[b]]
* Add/remove prefixes/suffix, e.g. untether -> tether -> tethering or "Lifetime" -> "A lifetime"/"The lifetime"
-------
Test cases:
* [[IOS]]
* [[Riku]], [[Lulu]], [[Yuna]]
* [[Sandy]] (birth dates, suggest prefix index?)
* [[Pepe]] - better auto date formatting
* [[Dreamweaver (disambiguation)]] - Primary links
* [[Ikeda]]
* [[Rashomon]] incorrect primary link
* Dates: [[Julia]]
Acid tests:
* [[( ) (disambiguation)]]
"""
import re, sys, MySQLdb
import wikipedia
import cgitb; cgitb.enable(logdir='tracebacks')
# Active wiki site object; stays None until Robot.__init__ assigns it.
site = None #wikipedia.getSite()
# Titles passed to heading() so far, in call order.
headings = []
# Regex alternation searched against category names (underscore form) to
# recognise geographic places; used by parseline() to skip descriptions.
CatPlaces = r'_places_|_communities_|_constituencies_|_Country,_|_counties$|^Barangays_of_|^Cities_|^Plantations_|^Suburbs_of_|^Towns_|^Townships_in_|^Villages_|^Wards_of_|^Woredas_of_|micropolitan_area$|parishes$|_geography_stubs$'
# Regex template with one %s placeholder, anchored at the start of a `*`/`#`
# list item.  NOTE(review): not referenced in the visible part of this file.
RelinkText = ur"(?mi)^([*#]+[ '\"]*)(%s)([ '\"]*(?:[,\-–—]|''|\"| is | were |$))"
from related import dabcatlang
def bullets(links, sortkey=None):
    """Render titles as a wikitext bullet list, one ``* [[Title]]`` per line.

    Underscores in each title are shown as spaces.  ``sortkey`` is accepted
    but not used.
    """
    rows = []
    for name in links:
        rows.append("* [[%s]]" % name.replace('_', ' '))
    return '\n'.join(rows)
def likeescape(s, escape='\\'):
    """Escape ``s`` so it matches literally inside a SQL ``LIKE`` pattern.

    s      -- text to escape
    escape -- the LIKE escape character (default backslash).  It is doubled
              wherever it occurs in ``s`` and placed before ``_`` and ``%``.
    Returns the escaped string.
    """
    # Double the escape character first so the ones added below stay single.
    s = s.replace(escape, escape + escape)
    return s.replace('_', escape + '_').replace('%', escape + '%')
def EnglishJoin(list):
    """Join strings as an English "or" list.

    Zero or one item: the item itself (or ''); two items: "a or b";
    three or more: "a, b, or c".
    """
    count = len(list)
    if count > 2:
        return ', '.join(list[:-1]) + ', or ' + list[-1]
    separator = ' or ' if count == 2 else ''
    return separator.join(list)
def printu(s):
    """Print ``s``; a ``unicode`` value is encoded to UTF-8 bytes first."""
    if isinstance(s, unicode):
        s = s.encode('utf-8')
    print (s)
def htmlout(string, data=()):
    """Print ``string % data`` with every string value in ``data`` escaped.

    string -- %-format template
    data   -- sequence of values for the template; ``str``/``unicode`` values
              are passed through wikipedia.escape(), other values are used
              as they are.  Defaults to an empty (immutable) tuple.
    The result is printed as-is when it is a byte string, otherwise it is
    encoded to UTF-8 first.
    """
    values = tuple(
        wikipedia.escape("%s" % value) if isinstance(value, (str, unicode)) else value
        for value in data
    )
    s = string % values
    if isinstance(s, bytes):
        print(s)
    else:
        print(s.encode('utf-8'))
# Emit a debug message through htmlout(), which passes `s` through
# wikipedia.escape() before printing.
# NOTE(review): the format string is a single-quoted literal broken across
# three physical lines, which is not valid Python; it looks like surrounding
# markup was lost from this literal -- confirm against the upstream file.
def debug(s):
htmlout('
%s
', (s,))
# Report an informational message via wikipedia.output(), with a printu()
# call before and after it.
# NOTE(review): both printu() literals look truncated in this copy (the second
# is a single-quoted string broken across two lines); presumably they wrapped
# the message in markup -- confirm against upstream.
def info(s):
printu('')
wikipedia.output("%s" % s)
printu('
')
# Report a warning via wikipedia.output(); the message is prefixed with a
# colour-tagged "WARNING" label and bracketed by two printu() calls.
# NOTE(review): both printu() literals look truncated in this copy (the second
# is a single-quoted string broken across two lines) -- confirm against upstream.
def warn(s):
printu('')
wikipedia.output("\03{lightorange}WARNING\03{default}: %s" % s)
printu('
')
# Report an error via wikipedia.output(); the message is prefixed with
# "ERROR: " and bracketed by two printu() calls.
# NOTE(review): both printu() literals look truncated in this copy (the second
# is a single-quoted string broken across two lines) -- confirm against upstream.
def error(s):
printu('')
wikipedia.output(u"ERROR: %s" % s)
printu('
')
# Build the markup for a link to wiki page `link` on the global `site`.
#   link         -- page title (spaces or underscores)
#   title        -- link label; defaults to `link` with underscores as spaces
#   className    -- optional value for a class="..." attribute
#   addAttribute -- optional raw attribute text appended after the class
# Returns the formatted string.
# NOTE(review): the format string below has one %s but receives five values
# (host name, URL path, escaped title, attributes, escaped label), so it would
# raise TypeError as written; the literal has presumably lost its markup --
# confirm against upstream.
def CreateLink(link, title=None, className="", addAttribute=''):
if not title: title = link.replace('_', ' ')
attributes = ' class="'+className+'"' if className else ''
if addAttribute:
attributes += ' '+addAttribute
return '%s ' % (
site.hostname(),
site.nice_get_address(wikipedia.urllib.quote((link.encode('utf-8') if isinstance(link, unicode) else link).replace(' ','_'), safe=";@$!*(),/:-_.")),
wikipedia.escape(link.replace('_',' ')),
attributes,
wikipedia.escape(title),
)
import time; starttime=time.time()
# Print a section heading and record it.
#   level     -- heading level number
#   title     -- heading text; also appended to the global `headings` list
#   style     -- optional inline style text
#   className -- optional class name
# Before printing, logs the previous heading (or '?') via wikipedia.logtime(),
# prints something derived from the time elapsed since the last heading, and
# resets the global `starttime`.  stdout is flushed afterwards.
# NOTE(review): the print() format string has no placeholder for its value and
# the htmlout() template has one %s for seven values; both literals appear to
# have lost their markup in this copy -- confirm against upstream.
def heading(level, title, style="", className=""):
wikipedia.logtime(headings[-1] if headings else '?')
global starttime
print("\n" % (time.time()-starttime,))
starttime = time.time()
htmlout('%s ' % (level, wikipedia.sectionencode(title), ' style="%s"'%style if style else '', ' class="%s"'%className if className else '', title, wikipedia.sectionencode(title), level))
#print ""
sys.stdout.flush()
headings.append(title)
def canonicalTitle(title, firstupper=True, underscore=False):
    """
    Converts unicode or bytes string to mw titles
    support: percent-encoded UTF-8, HTML character references

    Steps, in order: percent-decode, decode the bytes as UTF-8 (falling back
    to latin-1), resolve HTML character references, drop LTR/RTL marks, drop
    the ``#section`` part, turn underscores into spaces, trim, strip leading
    colons, collapse runs of spaces, then optionally upper-case the first
    character and optionally turn spaces back into underscores.

    title      -- unicode or byte string
    firstupper -- upper-case the first character of the result
    underscore -- use underscores instead of spaces in the result
    Returns the normalised title (possibly empty).
    """
    # TODO namespace support, e.g. [[WP: Foo]]
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    # Unpercent-encode
    title = wikipedia.urllib.unquote(title)
    try:
        title = unicode(title, 'utf-8')
    except UnicodeDecodeError:
        # Not valid UTF-8; latin-1 accepts any byte sequence
        title = unicode(title, 'latin-1')
    # HTML character references
    title = wikipedia.html2unicode(title)
    # Remove ltr and rtl markers
    title = title.replace(u'\u200e', '').replace(u'\u200f', '')
    # Strip the section part
    if '#' in title:
        title = title[:title.index('#')]
    # Underscore to space and Strip space
    title = title.replace('_', ' ').strip().lstrip(':')
    # Merge multiple spaces.  The test must be for a DOUBLE space: testing for
    # a single space would never become false and the loop would not end.
    while '  ' in title:
        title = title.replace('  ', ' ')
    # First uppercase
    if firstupper and title:
        title = title[0:1].upper() + title[1:]
    if underscore:
        title = title.replace(' ', '_')
    return title
def wikilinkregex(t, firstupper=True):
    """Return a regex fragment matching wikilink spellings of title ``t``.

    The title is normalised with canonicalTitle().  With ``firstupper`` the
    first character matches in either case; spaces match runs of spaces or
    underscores, and ``|`` is left as regex alternation.
    """
    t = canonicalTitle(t, firstupper)
    # Workaround for titles with an escape char
    if firstupper:
        head, tail = t[0], t[1:]
        t = u'[%s%s]%s' % (head.upper(), head.lower(), tail)
    pattern = re.escape(t)
    # re.escape() also escaped the first bracket pair; restore one '[' and one ']'
    pattern = pattern.replace('\\[', '[', 1)
    pattern = pattern.replace('\\]', ']', 1)
    pattern = pattern.replace('\\ ', '[ _]+')
    return pattern.replace('\\|', '|')
# Cache of open MySQLdb connections keyed by (host, dbname)
connections = {}
def getConn(dbname, host=None):
    """Return a cached MySQLdb connection for ``dbname``, opening one if needed.

    dbname -- database name; ``_p`` is appended when missing
    host   -- server host name; when omitted it is derived from the database
              name (underscores become dashes, plus '.rrdb.toolserver.org')
    A cached connection is pinged first and replaced when the ping fails.
    Errors raised by MySQLdb.connect() propagate to the caller.
    """
    if not dbname.endswith('_p'):
        dbname += '_p'
    key = (host, dbname)
    if key in connections:
        try:
            connections[key].ping()
        except Exception:
            # Stale or dropped connection: forget it so a new one is opened below
            del connections[key]
    if key not in connections:
        connections[key] = MySQLdb.connect(
            db=dbname,
            host=host or dbname.replace('_', '-')+'.rrdb.toolserver.org',
            read_default_file='/home/dispenser/.my.cnf',
            # WMF's databases varbinary so it'll always be return in UTF-8 byte string
            # charset Option for wiktionary
            charset=None if 'wiktionary' in dbname else 'utf8',
            use_unicode=False
        )
    return connections[key]
#def dropConn(dbname=None):
# for key in (dbname,) if dbname else connections:
# if key in connections:
# del connections[key]
# Cache of fetched page HTML fragments keyed by canonical (underscore) title.
html_cache = {}
# Return (and cache) part of the rendered HTML of wiki page `title`.
# The page is fetched through the global `site` and decoded as UTF-8.
# NOTE(review): the slice below uses empty-string markers and a third (step)
# component, so it does not describe a meaningful range as written; the marker
# strings were presumably lost from this copy -- confirm against upstream.
def getParsedText(title):
title = canonicalTitle(title, underscore=True)
if title not in html_cache:
urlname = wikipedia.urllib.quote(title.encode('utf-8'), safe=";@$!*(),/:-_.")
html = site.getUrl(site.nice_get_address(urlname)).decode('utf-8')
# XXX Vector skin specific
html_cache[title] = html[html.index(''):html.index(""):html.index("")]
return html_cache[title]
# Multiline regex used by getsummary() to pull a short description out of a
# page's lead text; getsummary() reads its group 2.
# NOTE(review): this literal is a raw single-quoted string broken across three
# lines and contains a `(?P` group with no name, so it cannot compile as shown;
# parts of the pattern were presumably lost -- restore from upstream.
extract_summary_R = re.compile(r'
(?:.*["\',]* (?:is|was|were|are)(?= )|.*?.*? )(\'\'|"|,| \((?:\(.*?\)|[^(\n])*?\)|[\w \t]*\'*.*? \'*| is| was| were| are| or)* *(?P.+?)[,.:;]?
', re.M)
# Try to derive a short ", description" string for page `title` from its
# rendered HTML (via getParsedText()).
# Returns '' for section redirects, for disambiguation/set-index pages and
# when extract_summary_R does not match; otherwise returns the extracted text
# prefixed with ", ".
# NOTE(review): several string/regex literals in this function are broken
# across lines or empty in this copy (the replace() arguments, the re.sub()
# pattern, the two print templates); they presumably contained markup that was
# lost -- restore from upstream before changing any logic here.
def getsummary(title):
s = getParsedText(title)
info("Download [[%s]] for description" % title)
# Avoid section redirects, rd_fragment isn't complete yet
if 'redirectToFragment' in s:
# redirectToFragment("#Corkscrew_Senton");
info("[[%s]] is a section redirect"%title)
return ''
if ' id="disambigbox"' in s or ' id="setindexbox"' in s:
info("[[%s]] disambiguation page"%title)
return ''
s = s.replace('
\n', ' ')
s = re.sub(r'
]+class="reference">.*? |||
|?(?![bip]\b)\w+\b.*?>', '', s, flags=re.DOTALL)
# Collapse every run of non-newline whitespace to one space
s = re.sub(r'(?u)[^\S\n]+', ' ', s) # convert emsp
s = s.replace('', "''").replace(' ', "''")
m = extract_summary_R.search(s)
# FIXME use HTML instead
if m:
#wikipedia.output("\03{lightsilver}%s\03{default}%s\03{lightsilver}%s\03{default}"%(m.group()[:m.start(2)-m.start()], m.group(2), m.group()[m.end(2)-m.start():],))
s = m.group(2)
# Long extracts are cut at the first full stop that is followed by spaces
# and a capital letter (or by the end of the string)
if len(s) > 250:
s = re.sub(ur'(.*?)\.((?:"|\'\'|) +[A-Z].*|$)', r'\1', s)
info("Triming string from %d to %d bytes (%+d)"%(len(m.group()), len(s), len(s)-len(m.group()),))
wikipedia.output(wikipedia.unescape("\03{lightsilver}%s\03{default}"%m.group().replace(s, '\03{default}%s\03{lightsilver}'%s)))
s = ', %s'%wikipedia.unescape(s).strip()
if s.strip(',. '):
# FIXME ", American actor" matched ", American actor and musician"
print ''%(wikipedia.jsescape(title), wikipedia.jsescape(s),)
return s
else:
error("Unable to get extract from [[%s]]'s HTML" % title)
print '%s '%(wikipedia.escape(re.sub(r'\s*\n\s*\n\s*','\n\n', s)).encode('utf-8'),)
return ''
# TODO move to commonfixes
# Substring replacements applied by strip_variations() so that spelling and
# punctuation variants of a title compare equal.
# NOTE(review): strip_variations() applies these in dict iteration order and
# some entries overlap (e.g. 'k' -> 'c' versus multi-letter keys), so the
# result may depend on that order -- verify if the order ever changes.
replacementset = {
# Unicode to ASCII
u'−': '-', # minus sign
u'–': '-', # en dash
u'—': '-', # em dash
u'…': '...', # ellipsis
u'×': 'x', # times
u'“': '"',
u'”': '"',
# ASCII approximations and substitutions
' -': '-',
'- ': '-',
'--': '-',
'_': ' ',
'`': "'",
'/': '-',
'*': 'x',
# language approximations
'ae': 'a',
'ey': 'ei',
'oh': 'o',
'ou': 'o',
'uu': 'u',
" 'n": ' an',
' and ': ' & ',
' the ': ' ',
' of ': ' in ',
'k': 'c',
}
import unicodedata
def strip_accents(s):
    """Return ``s`` without combining marks (category ``Mn``) after NFD decomposition."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def strip_variations(s):
    """Reduce a title to a rough comparison key.

    Everything from the first " (" onwards is dropped, the rest is
    lower-cased and padded with one space on each side, accents are
    stripped, the ``replacementset`` substitutions are applied, spaces and
    common punctuation are removed, and the result is upper-cased.
    """
    cut = s.find(' (')
    if cut != -1:
        s = s[:cut]
    key = strip_accents(" %s " % s.lower())
    for old, new in replacementset.iteritems():
        key = key.replace(old, new)
    for ch in ' !"\',-.:;?':
        key = key.replace(ch, '')
    return key.upper()
def skipredirect(m):
    """Regex-substitution callback that bypasses trivial redirects.

    ``m.group()`` is a link title.  If that title is a redirect (looked up
    with the module-level ``cursor``) whose target differs only by the
    variations strip_variations() ignores, the target is returned, keeping a
    lower-case first letter when the original had one.  In every other case
    (no redirect, section redirect, qualifier would be lost, target too
    different) the matched text is returned unchanged.
    """
    # TODO consider [[Aude (river)]] the same as [[Aude River]]
    # TODO add variable for acceptable changes to qualifier [0,1], where 1 allow complete change
    title = m.group()
    # `cursor` been globally defined
    cursor.execute("""/* dabfix.skipredirect LIMIT:30 */
SELECT rd_title, rd_fragment
FROM page
JOIN redirect ON page_id=rd_from
WHERE page_namespace=%s AND page_title=%s
AND rd_namespace=page_namespace
-- Field is currently not populated
-- AND rd_interwiki = ""
""", (0, canonicalTitle(title, underscore=True).encode('utf-8'),))
    rows = cursor.fetchall()
    if not rows:
        return m.group()
    first = rows[0]
    target = first[0].replace('_', ' ').decode('utf-8')
    if first[1]:
        # avoid bypassing section redirects
        return m.group()
    if any(c in title for c in '(,') and not any(c in target for c in '(,'):
        # don't lose the qualifier
        info("Found redirect: [[%s]] to [[%s]]" % (title, target, ))
        return m.group()
    if strip_variations(target) == strip_variations(title):
        info("Bypassing redirect [[%s]] to [[%s]]" % (title, target, ))
        if title[0].islower():
            return target[0].lower() + target[1:]
        return target
    debug('Comparing %r to %r' % (strip_variations(title), strip_variations(target),))
    info("Redirect: [[%s]] to [[%s]]" % (title, target, ))
    return m.group()
# Regex-substitution callback that rewrites one disambiguation-page entry.
# `line` is a match object; line.group() is the entry text.  Entries without
# "[[" are returned unchanged.  Otherwise the function normalises link
# formatting, looks every linked title up in the database, reports on the
# first red link, unlinks links that do not overlap any entry of `prefixes`,
# collects birth/death/debut years from categories and may fetch a
# description with getsummary().  Progress is reported through
# info()/warn()/debug()/wikipedia.output().
# Returns the rewritten entry, or line.group() when the text is unchanged.
# NOTE(review): `cursor` and `prefixes` are free names here; `cursor` is set by
# Robot.__init__, `prefixes` is not defined in the visible part of the file.
# NOTE(review): several lines of this function are damaged in this copy (regex
# named groups reduced to `(?P`, format strings without placeholders, two
# statements fused on one line); restore from upstream before editing logic.
def parseline(line):
# Normalise a births/deaths category name such as "47 BC" or
# "2nd-century BC" into display text; returns None when nothing matches.
def getYear(s):
# 47 BC
# 2nd-century BC
m = re.match(r'(\d+)(s(?= )|)((?:st|nd|rd|th)-century|)( BC|)', s.replace('_', ' '))
if m:
return "%s%s%s%s"%(
'c. ' if m.group(2) else '',
m.group(1),
m.group(3),
" BC" if m.group(4)==" BC" else ''
)
return None
def yearRange(birth, death, born="born", died="died"):
"""
Formats birth and death years so "AD" is hidden for the modern era
Accepts: 17 century/1860s/c. 1867/1867 [BC|AD]
"""
# Era suffix for a partitioned year: " BC", nothing for "?" or for values of
# three or more characters, otherwise " AD"
def n(tup):
return " BC" if tup[1] else "" if tup[0]=="?" or len(tup[0])>=3 else " AD"
death = death.partition(' BC') if death else None
birth = birth.partition(' BC') if birth else None
if birth and not death: return u"%s %s%s" % (born, birth[0], n(birth),)
elif birth and death:
if birth[1]==death[1]: return u"%s–%s%s" % (birth[0], death[0], n(death),)
else: return u"%s BC–%s AD"%(birth[0], death[0],)
elif not birth and death: return u"%s %s%s" % (died, death[0], n(death),)
else: return u""
def cmpr(a, b): # A a subset of B
#printu("Comparing %s to %s "%(a,b))
if ''.join(re.split(r'[^A-Z0-9]+', b)) == a.upper(): # Initialisms
return True
else:
return strip_variations(a) in strip_variations(b)
text = line.group()
if '[[' not in text:
print '\n \n'
return text
else:
# Fix formatting of primary link
text = re.sub(ur"(?m)^\* *(''|\")\[\[([^{|}[\]\n]+)( \([^{|}[\]\n]+\))\]\]([, ]*)\1", ur"* [[\2\3|\1\2\1\3]]\4", text)
# Simple [[MOS:DAB]] corrections
# NOTE(review): the next line fuses the start of a re.sub() call with the
# middle of the toDisambiguation() helper used further down; the text in
# between is missing from this copy.
text = re.sub(ur"^([^.]*?)(?Please create %s with:' % CreateLink(t+' (disambiguation)', className="new"))
htmlout('#REDIRECT [[%s]] ', (t.replace('_', ' '),))
return m.group()
else:
if m.group(2):
return m.expand(r'[[\1 (disambiguation)\2]]')
else:
return m.expand(r'[[\1 (disambiguation)|\1]]')
text = re.sub(r'\[\[([^{|}[\]\n]+)(\|.*?|)\]\]( *\(disambiguation\)|)', toDisambiguation, text)
# We now build the dictionary `links` telling us what is safe to unlink
# This is done marking links which are or have a redirect that is a subset
# of the `prefixes` list
#
links = {}
redlink = None
primarylink = None
titles_R = re.compile(r'(?<=\[\[)[^{|}[\]\n]+?(?=\s*(?:\|.*?|)\]\])')
for title in titles_R.findall(text):
if not primarylink:
primarylink = title
cursor.execute("""
/* dabfix.parseline LIMIT:30 */
SELECT page.page_namespace, page.page_title,
rd.page_namespace, rd.page_title, rd_fragment,
GROUP_CONCAT(IF(pp_value IS NULL, cl_to, NULL) SEPARATOR '|'),
(SELECT pp_value FROM page_props WHERE pp_page=IFNULL(rd.page_id, page.page_id) AND pp_propname="displaytitle")
FROM page
LEFT JOIN redirect ON rd_from = page.page_id
LEFT JOIN page AS rd ON rd.page_namespace = rd_namespace AND rd.page_title = rd_title
JOIN categorylinks ON cl_from = IFNULL(rd.page_id, page.page_id)
LEFT JOIN page AS catpage ON catpage.page_namespace = 14 AND catpage.page_title = cl_to
LEFT JOIN page_props ON pp_page = catpage.page_id AND pp_propname = "hiddencat"
WHERE page.page_namespace=%s
AND page.page_title = %s
GROUP BY page.page_title
LIMIT 1
""", (0, canonicalTitle(title, underscore=True),))
result = cursor.fetchone() or (None,)*7
# MySQL's default max packet length is 1 KB, truncating the rest
# Truncation may occur during a UTF-8 sequence, so we unsafely ignore it
result = tuple(s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in result)
# Notes:
# displaytitle isn't updated after a move, see [[Victory (1996 film)]]
d = dict(
ns = result[0],
title = result[1],
rd_ns = result[2],
rd_title = result[3],
rd_fragment = result[4],
displaytitle= result[6],
categories = (result[5] or '').split('|'),
# derived
dabpage = "All_disambiguation_pages" in (result[5] or ''),
redirects = [],
overlapping = any(cmpr(prefix, title) for prefix in prefixes),
# principle should be better defined, it use to be both primary and principle
principle = text.find(title) < 15 or text.find(title) < text.find(', ') < len(text)*2//3,
)
# get all redirect titles
cursor.execute("""/* dabfix LIMIT:30 */
SELECT page_title FROM page JOIN redirect on (page_id=rd_from) WHERE page_namespace=0 AND rd_namespace=0 AND rd_title=%s
UNION
SELECT rd_title FROM page JOIN redirect on (page_id=rd_from) WHERE page_namespace=0 AND rd_namespace=0 AND page_title=%s
""", (canonicalTitle(title, underscore=True),)*2 )
for tup in cursor:
(s,) = (s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in tup)
d['redirects'].append(s)
d['overlapping'] |= any(cmpr(prefix, s) for prefix in prefixes)
# debugging
#print "title key: %r " % (title,)
#print 'prefix list: ', prefixes, ' '
#print result, ' '
#print '%r ' % (d,)
links[title] = d
if not (title[0:3].islower() and title.find(':', 1)>0): # [[:ja:北原亞以子]]
if not redlink and not d['title']:
if not title.startswith('Special:'): # [[Special:PrefixIndex/...]]
redlink = title
if d['displaytitle']:
debug("Has DISPLAYTITLE:%(displaytitle)s" %d)
# TODO format pattern for ships
titleSpec = (
("No format", r"^(\d+)_(architecture)$", ''),
("Italics", r"^(\d+)_(albums|books|films|live_albums|musicals|novels|operas|plays|soundtracks|television_films|video_games)$", "''"),
("Quote", r"^(\d+)_(songs|singles|short_stories|television_episodes)$", '"'),
)
def formatLink(m):
# Test cases
# [[A (b)|"A" (b)]] DONE
# "[[A]]" DONE
# "[[A (b)|A]]"
# [[A (b)|A (b)]]
# [[A (b)|"A" (''b'')]] DONE
mark = m.group(4) or m.group(1)
title = m.group('title') # key in links dictionary
target = canonicalTitle(title, underscore=True)
label = "%s%s%s"%(m.group(1), m.group('label') or m.group('title'), m.group(1),)
new_label = label
if not target:
#info('section link')
# [[#section (pinball)]]
return m.group()
elif title not in links:
# Program screwed up somewhere
error("%s\n\nDoes not contain %r" % (repr(links).replace('{', '{\n').replace(',', ',\n'), title))
# throw error
links[title]['rd_title']
elif links[title]['rd_title'] or '#' in title or links[title]['title'] is None:
# Skip these as {{DISPLAYTITLE:}} or Category matching
# are incorrect. This is probably not necessary
# with good title comparer.
# TODO [[Category:Redirected_episode_articles]]?, links->categories contains target page
if '#' in title:
warn("[[%s]] links to a section" % (target,))
elif links[title]['rd_title']:
rd_title = links[title]['rd_title']
rd_fragment = links[title]['rd_fragment']
warn("[[%s]] redirects to [[%s]]" % (target, rd_title+('#'+rd_fragment if rd_fragment else ''),))
else:
# TODO add separate flag for red links, see above with Special:...
warn("[[%s]] is a red link" % (target, ))
elif links[title]['displaytitle']:
# Use {{DISPLAYTITLE:}} whenever available
new_label = wikipedia.html2unicode(links[title]['displaytitle'].replace('', "''").replace(' ', "''"))
#info("{{DISPLAYTITLE:%s}}" % (new_label,))
else:
# Otherwise fall back to category match
for rulename, pattern, c in titleSpec:
for cat in links[title]['categories']:
if re.search(pattern, cat):
if mark and mark != c:
warn("Formatting conflict (%s => %s) with [[Category:%s]]" % (mark, c, cat,))
else:
mark = c
info("%s rule %s matches [[Category:%s]]" % (rulename, pattern, cat,))
(subject, qualifier) = re.search(r'^(.+?)([ _]*\([^()]+\)|)$', target).groups()
new_label = ''.join((mark, subject, mark, qualifier)).replace('_', ' ')
#if label != new_label: debug("Automatic label: %s"%new_label)
def test(s): return canonicalTitle(re.sub(r"''|'''|\"|?\w+\b[^<>]*>", '', s))
# XXX how is [[w (x), y (z)]] handled?
(o_subject, o_qualifier) = re.search(r'^(.+?)([ _]*\([^(\n)]+\)|)$', label).groups()
(n_subject, n_qualifier) = re.search(r'^(.+?)([ _]*\([^(\n)]+\)|)$', new_label).groups()
(t_subject, t_qualifier) = re.search(r'^(.+?)([ _]*\([^(\n)]+\)|)$', target).groups()
#info("\ntarget: <%s>\nnew_label: <%s>"%(target,new_label,))
#info("\nt_subject: <%s> \nt_qualifier: <%s> \no_subject: <%s> \no_qualifier: <%s> \nn_subject: <%s> \nn_qualifier: <%s> \n"%(t_subject, t_qualifier, o_subject, o_qualifier, n_subject, n_qualifier, ))
if test(o_subject) == test(n_subject):
# Copy qualifier styling
# XXX Hack to copy extra formatting from displaytitle while keeping original formatting
if test(o_qualifier) == test(n_qualifier) and len(n_qualifier) <= len(o_qualifier):
n_qualifier = o_qualifier
# [[Flash (Chuck)|Flash (''Chuck'']]
if links[title]['principle'] and '#' not in title and test(n_subject) == test(t_subject):
debug("[[%s]] is the principle link" % title)
# If the target qualifier is not the same
# e.g. (''Buffy'' episode) => (Buffy: The Vampire Slayer episode)
if test(n_qualifier) != test(t_qualifier):
n_qualifier = t_qualifier.replace('_', ' ')
else:
n_qualifier = ''
new_label = ''.join((n_subject, n_qualifier))
repl = "[[%s|%s]]"%(title, new_label)
# TODO [[lower|"Lower"]] => "[[Lower]]"
# TODO avoid capitalizing non-principle links; e.g. [[Pest (organism)|pest]]
repl = re.sub(r"\[\[(.*?)\|(''|\")([^{|}[\]\n]+)\2\]\]", r'\2[[\1|\3]]\2', repl, flags=re.I)
repl = re.sub(r"\[\[( *(.+?) *)\| *\2 *\]\]", r'[[\1]]', repl, flags=re.I)
#TODO rename keys when changing titles
return repl
else:
return m.group()
# \g needs to be the same as the dictionary building one
text = re.sub(r"(''|\"|)\[\[(?P[^{|}[\]\n]+?)[ |]*(?P(?<=\|)(''|\"|).*?|)\]\],??\1", formatLink, text)
if redlink:
disambiguationcategory = dabcatlang.get(site.dbName()+'_p', dabcatlang['enwiki_p'])
# TODO eliminate the current worked on page from the list
cursor.execute("""
/* Trace pages back LIMIT:30 */
SELECT
page_namespace,
page_title,
page_is_redirect,
(SELECT IF(cl_to!='All_set_index_articles','(disambig)','(set-index)') FROM categorylinks WHERE cl_from=page_id AND cl_to IN ("""+','.join(("%s",)*len(disambiguationcategory))+""")) AS dab
FROM page
JOIN pagelinks ON pl_from=page_id
WHERE pl_namespace=%s AND pl_title=%s
""", disambiguationcategory+(0, canonicalTitle(redlink, underscore=True), ))
results = cursor.fetchall()
rows = cursor.rowcount
# TODO create summary from page with highest concentration of red links
if len(results):
printu("The following pages link to %s
" % (CreateLink(redlink, className="new"),) )
print('')
for tup in results:
(ns, title, redirect, dab) = (s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in tup)
printu(u"%s %s " % (CreateLink(wikipedia.namespaces[ns]+':'+title if ns else title), u"(redirect page)" if redirect else dab or u"", ))
print(" ")
else:
warn("No pages link to [[%s]]" % (redlink,))
if any(ns for (ns, title, redirect, dab) in results if ns<0):
wikipedia.output("Special page")
elif sum(1 for (ns, title, redirect, dab) in results if ns==0 and dab is None)==0:
# Display information about deletion
#
# TODO, improved by checking for an AfD subpage
# WP:CSD#G11 - Blatant advert; AfD - consensus to delete
remove_log_reason_R = re.compile(r'(^|\{\{|\[\[|/wiki/)(Project:|Wikipedia:|WP:|^)(AFD|HOAX|PROD|BLPPROD|Articles[_ ]+for[_ ]+deletion/[^{|}[\]]*|(CSD#|SD#|CSD[ _]+|CSD\]\] |^)(A7|G5|G11))\b', flags=re.I)
cursor.execute('''/* dabfix.parseline LIMIT:30 */
SELECT log_comment
FROM logging_ts_alternative
WHERE log_namespace=%s AND log_title=%s AND log_type="delete"
''', (0, canonicalTitle(redlink, underscore=True),))
m = None
for i, (log_comment,) in enumerate(cursor):
log_comment = log_comment.decode('utf-8')
info("Deletion logs: %(log_comment)s" % locals())
if not m: # and i==0:
m = remove_log_reason_R.search(log_comment)
if m:
text = ""
info('Removing [[%s]], matches %r' % (redlink, m.groups()))
else:
text = ""%text.rstrip()
info('Commenting out red link [[%s]] per [[MOS:DABRL]]: No article links to it' % redlink)
#elif 1 < len(results) < 4: # Unlink
# text = re.sub(r'\[\[(%s)\|?((?<=\|).*?)\]\]'%wikilinkregex(redlink), lambda m: m.group(2) or m.group(1), text)
else:
info("%s pages link here"% (rows,))
else:
# Per MOS:DAB, we unlink non-relevant links
# TODO This should be possibly move above the link removal code
overlapping_links = sum(link['overlapping'] for link in links.itervalues())
if overlapping_links >= 1:
def f(match):
t = match.group(1)
if t in ('floruit', 'Floruit', 'fl.',):
del links[t] # We pretend it does not exist
return match.group()
elif t not in links:
# [[again]] then [[again]] (already deleted)
return match.group(2) or match.group(1)
elif links[t]['overlapping']:
return match.group()
else:
# Unlink text
del links[t]
return match.group(2) or match.group(1)
text = re.sub(r'\[\[(?P[^{|}[\]\n]+?)[ |]*(?P(?<=\|).*?|)\]\]', f, text)
# we may have removed the first link
if primarylink not in links:
primarylink = links.keys()[0]
else:
info("No overlapping link on "+EnglishJoin(["[[%s]]"%key for (key, link) in links.iteritems() if link['overlapping']]))
if len(links)==1 and not any(links.values()[0][cat] for cat in ('dabpage', 'rd_title')):
# FIXME allow [[title]]'s IATA code
m = re.compile(r'''^
((?:[^'"[\],\n()]
|,\ [^[\]|(),\s]*(?=,)
|\[\[[^[\]\n]+\]\]
|"[^'[\],\n{}]+"
|''[^'"[\],\n{}]+''
|"
|''
|'(?!')
)+)
((?:\ -|[-,:]\ |\s)*)
(?:\(+([^(\n)]+)\)+)?
([-.,: ]*)
(.*)
''', re.M | re.X).search(text)
subject, meta, description = '', '', ''
subject = text
if m:
subject, spacer1, meta, spacer2, description = m.groups()
#printu('%s
' % ''.join('%s '%wikipedia.escape(s) for s in m.groups()))
htmlout('%s
' % ''.join('%%s '%tip for tip in ("link", "spacer1", "metadata", "spacer2", "description",)), m.groups())
description = "%s%s"%(spacer1 if spacer1.strip() else spacer2 if spacer2.strip() else ", " if description else '', description)
else:
error('> Unable to separate parts: %s'%text)
# Handy category list
printu(u'Categories: %s
'%' | '.join(CreateLink("Category:"+cat, cat.replace('_', ' ')) for cat in links[primarylink]['categories']))
# FIXME
if len(subject)-subject.find(']]') < 5 and 0 < subject.find('[[') < 8 and description == "":
if any(re.search(CatPlaces,cat) for cat in links[primarylink]['categories']):
wikipedia.output("Geographical places like [[%s]] don't need descriptions" % primarylink)
# TODO add section redirect here
# TODO add disambiguation check here
else:
description = getsummary(primarylink)
# FIXME
# Comics characters introduced in 1977
# 2006 comic debuts
# https://en.wikipedia.org/w/index.php?title=Firefly_(disambiguation)&diff=424199591&oldid=424193235
debut = []
birth = None
death = None
date = []
### print "%r"%(links,)
# category regex for ", a X blah"
# FIXME Handle the case when all we have are missing dates
debut_R = re.compile(r'^(?P\d{1,4})_(albums|architecture|books|films|live_albums|musicals|novels|operas|plays|poems|short_stories|EPs|songs|singles|soundtracks|television_episodes|television_films|video_games|works|manga|anime|sculptures|paintings)$')
descript_R = re.compile(r'^([ ,-]*)(a |an |(?= album | building | book | comic book | film | musical | novel | opera | play | short story | single | song | soundtrack | story | video game | manga | anime | sculpture | painting ))')
for cat in links[primarylink]['categories']:
# FIXME Less hardcoding
if cat.endswith("_births"): birth = getYear(cat) or birth
elif cat.endswith("_deaths"): death = getYear(cat) or death
elif cat=="Living_people": death = ""
elif cat=="Missing_people": death = "" or death
elif cat=="Possibly_living_people": death = ""
elif cat=="Year_of_death_missing": death = "?"
elif cat=="Year_of_birth_missing": birth = "?"
elif cat=="Year_of_death_unknown": death = "?"
elif cat=="Year_of_birth_unknown": birth = "?"
elif debut_R.search(cat):
debut.append(debut_R.search(cat).group("year"))
#info("> Got year %s from [[Category:%s]]"%(debut, cat,))
else:
vague_date = re.search(r'(_|^)(1[6-9]\d\d|20[0-4]\d)(_|$)', cat)
if vague_date:
date.append(vague_date.group(2))
info('Unused date category: %s' % (cat,))
# Subject (Metadata), Description
#if (text.find(primarylink) < 15 or (text.find(primarylink) < text.find(', ') < len(text)*2//3)) and '#' not in primarylink:
if links[primarylink]['principle'] and '#' not in primarylink:
if birth!=None or death!=None:
# People
# NOTE(review): the next line is truncated in this copy -- a comparison is
# fused with the tail of an unrelated message; the code that uses
# yearRange()/debut/descript_R is missing here.
if birth and birth.isdigit() and 1600 < int(birth) Unused year: %s"%EnglishJoin(debut))
else:
pass
text = subject.strip()
if meta: text += " (%s)"%meta.strip()
if description: text += description.lstrip()
text = text.strip(', ')
if text != line.group().strip():
wikipedia.output(u"\03{lightred}%s\03{default}"%line.group())
# NOTE(review): as written this replaces a single space with a single space,
# so the loop cannot end while `text` contains a space; presumably the intent
# is to collapse double spaces -- confirm against upstream.
while ' ' in text:
text = text.replace(' ', ' ').rstrip()
wikipedia.output(u"\03{lightgreen}%s\03{default}"%text)
print '\n \n'
return text
else:
wikipedia.output(u"\03{lightgreen}%s\03{default}"%text)
print '\n \n'
return line.group()
class Robot(object):
def __init__(self):
    """Bind the robot to the current page and open a database cursor.

    Takes the page from ``wikipedia.MyPage`` and its site, opens a cursor on
    the site's database (retrying on host "sql-s1-user" when the first
    attempt fails), publishes ``site`` and ``cursor`` as module globals, and
    initialises the per-run state and the option flags read from
    ``wikipedia.SysArgs``.
    """
    self.page = wikipedia.MyPage
    self.site = self.page.site()
    try:
        self.cursor = getConn(self.site.dbName()).cursor()
    except Exception:
        # Default host unreachable: fall back to the fixed host.  Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        self.cursor = getConn(self.site.dbName(), host="sql-s1-user").cursor()
    global site; site = self.site
    global cursor; cursor = self.cursor
    self.redirects = []
    self.prefixes = set()
    self.text = ''
    self.page_id = 0
    self.preview = False
    self.usecommonfixes = wikipedia.SysArgs.get("commonsfixes")!='no'# '#' in self.text
    self.enable_wiktionary = wikipedia.SysArgs.get("wiktionary")!='no'
    self.disambiguationcategory = dabcatlang.get(self.site.dbName()+'_p', dabcatlang['enwiki_p'])
    # Not implemented
    self.summaryflags = {}
def setsummary(self, flag, performed_on):
if flag not in self.summaryflags:
self.summaryflags[flag] = []
self.summaryflags[flag].append(performed_on)
def __repr__(self):
    """Return a command-line style description built from wikipedia.SysArgs."""
    flags = ["-%s:%s" % pair for pair in wikipedia.SysArgs.items()]
    return 'dabfix.py ' + ' '.join(flags)
# Insert a new section called `sectionname` with body `new_text` into
# self.text.  A level-2 heading is used when self.text already contains a
# level-2 heading, otherwise level 3.  Nothing happens when `new_text` is empty.
# NOTE(review): the print template near the end is an empty string formatted
# with two values, which would raise TypeError as written; it has presumably
# lost its markup in this copy -- confirm against upstream.
def addsection(self, sectionname, new_text):
repl = "\n== %s ==\n%s\n" if re.search(r'(?m)^==[^=]+==$', self.text) else "\n=== %s ===\n%s\n"
if new_text:
debug('Adding %r section (%d lines)' % (sectionname, new_text.count('\n')+1,))
# Place before the last template
# Or the last empty section (avoid == References ==\n {{reflist}})
self.text, count = re.subn(
r'(?s)(?=(\n(=+[^\n]+=+\s*|)\{\{[^{}]+\}\}\s*)+[^{]*?$)',
repl%(sectionname, new_text),
self.text,
1)
# No insertion point matched: append the section at the end instead
if count==0:
self.text += repl%(sectionname, new_text)
# Add to JS list for removing
print '' % (
wikipedia.jsescape(sectionname),
r"[\r\n]+(=+) %s \1[\r\n]+((?![{=}]).*[\r\n]*)*"%re.escape(sectionname),
)
def addprefix(self, t):
iEnd=t.find('_(')
if iEnd==-1: iEnd=None
self.prefixes.add(t[:iEnd])
def reconnect(self, reason=None):
    """Open a fresh cursor after MySQL dropped the connection.

    Rebinds both the module-level ``cursor`` and ``self.cursor`` so that
    methods using either one stop talking to the dead connection.
    ``reason`` is accepted but not used.
    """
    # MySQL drop the connect, we'll need to reconnect
    # FIXME this is a hack, it should be handled by a connection routine that also flushes the cursor
    global cursor
    cursor = getConn(self.site.dbName()).cursor()
    # __init__ keeps the same cursor in self.cursor; keep the two in step
    self.cursor = cursor
# Collect the title prefixes used for link matching and some link statistics.
# Adds the page's own title and every redirect to it to self.prefixes (via
# addprefix()), stores the redirect titles in self.redirects, stores the list
# of link targets found on the page in self.existingLinks, and sets
# self.median to the middle value of the per-link frequency counts
# (20 when that query fails).
# NOTE(review): several print statements here have single-quoted literals
# broken across lines or empty; they presumably printed markup that was lost
# in this copy -- confirm against upstream.
def getprefixes(self):
print ''
heading(2, "Redirects")
self.addprefix(self.page.title(underscore=True))
cursor.execute("SELECT page_title FROM page JOIN redirect ON page_id=rd_from WHERE page_namespace=0 and rd_namespace=%s and rd_title=%s", (0, self.page.titleWithoutNamespace(underscore=True),))
rows = cursor.rowcount
if rows:
print '
' if rows > 6 else ''
for (redirect,) in cursor:
self.redirects.append(redirect)
print "%s "%CreateLink(redirect)
self.addprefix(redirect.decode('utf-8'))
print " "
else:
print 'There are no redirects
'
info('Prefixes used for matching: %s' % (EnglishJoin(sorted(self.prefixes)),))
print ' '
# Pre-materialize sub-query for use in Blue and Red link recovery
cursor.execute("""/* dabfix.getprefixes LIMIT:30 */
/* List of links on the page */
SELECT IFNULL(rd_title, pl_title)
FROM page
JOIN pagelinks ON pl_from = page.page_id
LEFT JOIN page AS rd ON rd.page_namespace = pl_namespace AND rd.page_title=pl_title
LEFT JOIN redirect ON rd_from = rd.page_id AND rd_namespace = 0
WHERE page.page_namespace = %s AND page.page_title = %s
AND pl_namespace=0
UNION SELECT %s
""", (self.page.namespace(),)+(self.page.title(underscore=True),)*2)
self.existingLinks = cursor.fetchall()
self.median = 0
# Best effort: on any failure reconnect and assume a fixed median instead
try:
cursor.execute("""/* dabfix.getprefixes LIMIT:30 NM */
SELECT COUNT(*) AS FREQ
FROM page AS dab
JOIN pagelinks AS p ON p.pl_from = dab.page_id
JOIN pagelinks AS s ON s.pl_namespace=p.pl_namespace AND s.pl_title=p.pl_title
JOIN page AS blue ON blue.page_namespace=p.pl_namespace AND blue.page_title=p.pl_title
WHERE dab.page_namespace=%s AND dab.page_title=%s
AND p.pl_namespace=%s
GROUP BY blue.page_namespace, blue.page_title
-- GROUP BY p.pl_namespace, p.pl_title
ORDER BY FREQ;
""", (self.page.namespace(), self.page.title(underscore=True), 0,))
results = cursor.fetchall()
if results:
self.median, = results[len(results)//2]
debug('The median linktivity is %d (sample %d links)'%(self.median, len(results),))
except Exception, e:
self.reconnect()
self.median = 20
warn('Unable to determine median linktivity (%s), assuming %d'%(e, self.median))
def getdefinitions(self):
    """Return Wiktionary page titles that match this page's prefixes.

    For every prefix, both capitalisations plus the ``-x`` and ``x-`` affix
    forms are looked up in the wiktionary database of the site's language,
    together with titles already linked through ``wikt:`` interwiki links.
    Redirects are followed.  Rows are ordered by descending page length.

    Returns a list of unicode titles, or None when the Wiktionary query
    fails with a MySQLdb.OperationalError (a "FAILED" section is added to
    the page text in that case).
    """
    titles_to_look_for = []
    # Build a list of title permutation from prefixes list
    for title in self.prefixes:
        title_lcfirst = title[0:1].lower()+title[1:]
        titles_to_look_for += [
            # Include first uppercase and first lowercase variants
            title, title_lcfirst,
            # e.g. [[wikt:-san]]
            '-'+title, '-'+title_lcfirst,
            # e.g. [[wikt:emo-]]
            title+'-', title_lcfirst+'-',
        ]
    # Get existing wiktionary links (XXX:Feb2012:{{Sec link auto}} broken it)
    # FIXME page_id is always 0
    try:
        self.cursor.execute(
            "/*LIMIT:30 NM*/SELECT iwl_title FROM iwlinks WHERE iwl_prefix IN ('wikt', 'wiktionary') AND iwl_from=%s",
            (self.page_id,)
        )
    except MySQLdb.OperationalError as e:
        # 1317 = query interrupted, 2006 = server has gone away: carry on
        # without the extra interwiki titles; anything else is re-raised
        if e.args[0] not in (1317, 2006):
            raise
    except Exception as x:
        # mystery error
        raise BaseException(self.page_id, self.page.title(), repr(x))
    else:
        # Each row is a 1-tuple.  Append the decoded title as ONE entry;
        # `list += string` would add its characters one by one.
        for (iwl_title,) in self.cursor:
            titles_to_look_for.append(iwl_title.decode('utf-8'))
    try:
        query_start = time.time()
        conn = getConn("%swiktionary"%self.site.language())
        wikt_curs = conn.cursor()
        # XXX Older servers masquerade utf-8 as latin-1 varchar
        wikt_curs.execute("DESCRIBE page page_title")
        if 'varchar(255)' in wikt_curs.fetchall()[0]:
            wikt_curs.execute("SET NAMES 'latin1'")
        wikt_curs.execute("""
/* dabfix.getdefinitions() LIMIT:90 */
-- SELECT DISTINCT derived.page_title FROM (
(
SELECT DISTINCT page.page_title, page.page_len
FROM page
WHERE page.page_namespace=0
AND page.page_title IN ("""+ ','.join(('%s',)*len(titles_to_look_for))+""")
) UNION DISTINCT (
SELECT DISTINCT page.page_title, page.page_len
FROM page
JOIN redirect ON rd_namespace=page.page_namespace AND rd_title=page.page_title
JOIN page AS rd ON rd.page_id=rd_from
WHERE page.page_namespace=0
AND rd.page_namespace=0
AND rd.page_title IN ("""+ ','.join(('%s',)*len(titles_to_look_for))+""")
)
ORDER BY page_len DESC
-- ) AS derived;
""", tuple(s.encode('utf-8') for s in titles_to_look_for)*2)
    except MySQLdb.OperationalError as e:
        errno, strerror = e.args
        if errno == 1317: # 'Query execution was interrupted'
            error("Wiktionary database did not response in time (%d seconds)" % (time.time()-query_start,))
        else:
            error("Wiktionary OperationalError (%d, %s)" % (errno, strerror))
        self.addsection('Wiktionary', "* FAILED %s" % strerror)
        return # Abort
    wikipedia.logtime("Got Wiktionary links")
    # MySQL bug prevents DISTINCT and ORDER BY used together
    definitions = [page_title.decode('utf-8') for (page_title, page_len,) in wikt_curs]
    conn.close()
    return definitions
def addWiktionary(self):
    """Insert or refresh the {{wiktionary}} box in ``self.text``.

    Does nothing unless ``self.enable_wiktionary`` is set.  Definitions come
    from ``self.getdefinitions()``; at most ``max_definitions`` of them are
    put in the box and the remainder is reported as excluded.

    Changes from the previous version:
    * ``getdefinitions()`` returns None when it aborts after a database
      error (it has then already added a "FAILED" section).  That case used
      to fall into the "no definitions" branch and strip any existing
      {{wiktionary}} box from the page; the text is now left untouched.
    * The box text is substituted literally.  It used to be passed to
      ``re.sub`` as a replacement template, so a backslash in a title was
      interpreted as an escape / group reference.
    * The closing ``print`` used a string literal broken across two lines;
      it is now written with an explicit ``\\n``.
    """
    # Doesn't work with multiple boxes
    max_definitions = 5
    if not self.enable_wiktionary:
        return
    heading(2, 'Wiktionary links', className="debug")
    definitions = self.getdefinitions()
    print('')
    if definitions is None:
        # Lookup aborted: keep whatever box the page already has.
        print('\n')
        return
    if definitions:
        shown = sorted(definitions[0:max_definitions])
        wiktionarylinks = "{{wiktionary|%s}}" % '|'.join(shown).replace('_', ' ')
        del definitions[0:max_definitions]
        htmlout("Definitions box: %s", (wiktionarylinks,))
        if definitions:
            wikipedia.output("Excluding %s since {{Wiktionary}} is limit to %s definitions" % (
                EnglishJoin(["[[wikt:%s]]"%page_title for page_title in definitions]),
                max_definitions,
            ))
    else:
        wiktionarylinks = ""
        wikipedia.output("No definitions from wiktionary")
    # Give infoboxes a "wikt" parameter holding a bare {{wiktionary}} so the
    # substitution below fills it in as well.
    self.text = re.sub(r'(?i)(\{\{Infobox(?:[^{|}]|\{\{[^{}]+?\}\}|\|(?!\s*wikt))*?)(?:\s*\|\s*wikt\s*=|)(\s*)\}\}', r'\1\2| wikt = {{wiktionary}}\2}}', self.text)
    box = re.search(r'(?sx)\{\{([Ww]iktionary)(\s*\|.*?|)\}\}[ ]*', self.text)
    if box:
        # Function replacement keeps the inserted text literal.
        self.text = box.re.sub(lambda _m: wiktionarylinks, self.text)
    elif wiktionarylinks:
        self.text = wiktionarylinks + '\n' + self.text
    print('\n')
def doubleredirect(self):
    """Report links on this page that pass through two redirects.

    Queries for pages linked from the current title that are redirects
    whose target is itself a redirect, and prints one
    "link -> target -> final target" line per hit under a level-3 heading.
    Only prints; ``self.text`` is not modified.
    """
    # XXX what this suppose to do?
    dab_title = self.page.title(underscore=True)
    cursor.execute("""
/* dabfix.doubleredirect() LIMIT:30 */
SELECT link.page_title, targt.page_title, dbl.rd_title
FROM page AS dab
JOIN pagelinks ON pl_from = dab.page_id
JOIN page AS link ON link.page_namespace=pl_namespace AND link.page_title=pl_title
JOIN redirect ON redirect.rd_from = link.page_id
JOIN page AS targt ON targt.page_namespace=redirect.rd_namespace AND targt.page_title=redirect.rd_title
JOIN redirect AS dbl ON dbl.rd_from = targt.page_id
WHERE dab.page_namespace = 0
AND dab.page_title = %s
AND targt.page_is_redirect = 1
""", (dab_title,))
    chains = cursor.fetchall()
    if not chains:
        return
    heading(3, "Double redirects")
    print('')
    for chain in chains:
        links = [CreateLink(title) for title in chain]
        print("%s → %s → %s " % tuple(links))
    print(' ')
def primary_entry(self):
    """Find primary-topic articles that link to, and are linked from, this page.

    One query pairs pages linking to the current page ("in") with pages the
    current page links to ("out"), directly or through redirects.  For each
    matching article that transcludes a hatnote template, its title (or the
    titles redirecting to it) is added through ``self.addprefix`` and its
    list entry in ``self.text`` is turned into a bolded lead line.

    NOTE(review): the indentation of this method was reconstructed; confirm
    the nesting of the post-loop statements against the real file.
    """
    # TODO Determine which topics go where on multi-topic primary pages
    hatnotes = ("Hatnote",)
    # Titles that should be shown as the primary topic.
    primlink = {}
    # Parameter order: one value per hatnote name, then namespace and title.
    cursor.execute("""/* dabfix.primary_entry LIMIT:30 */
SELECT GROUP_CONCAT(DISTINCT rdpagein.page_title SEPARATOR '|'),
pagein.page_title,
dab.page_title,
pageout.page_title,
rdpageout.page_title,
EXISTS (SELECT 1
FROM templatelinks
WHERE tl_from = pagein.page_id
AND tl_namespace = 10
AND tl_title IN ("""+','.join(("%s",)*len(hatnotes))+""")
) AS hatnote
FROM page AS dab
JOIN pagelinks AS linkin ON linkin.pl_title = dab.page_title AND linkin.pl_namespace = 0
JOIN page AS pagein ON pagein.page_id = linkin.pl_from AND pagein.page_namespace = 0
LEFT JOIN redirect AS rdin ON rdin.rd_title = pagein.page_title AND rdin.rd_namespace = 0
LEFT JOIN page AS rdpagein ON rdpagein.page_id = rdin.rd_from AND rdpagein.page_namespace = 0
JOIN pagelinks AS linkout ON linkout.pl_from = dab.page_id AND linkout.pl_namespace = 0
JOIN page AS pageout ON pageout.page_title = linkout.pl_title AND pageout.page_namespace = 0
LEFT JOIN redirect AS rdout ON rdout.rd_from = pageout.page_id AND rdout.rd_namespace = 0
LEFT JOIN page AS rdpageout ON rdpageout.page_title = rdout.rd_title AND rdpageout.page_namespace = 0
WHERE dab.page_namespace = %s
AND dab.page_title = %s
AND (
pagein.page_id = pageout.page_id
OR pagein.page_id = rdpageout.page_id
OR rdpagein.page_id = pageout.page_id
OR rdpagein.page_id = rdpageout.page_id
)
GROUP BY pagein.page_title
LIMIT 25 /* should be enough */;
""", hatnotes+(self.page.namespace(), self.page.title(underscore=True),))
    rows = cursor.rowcount
    def checkTitle(a, b):
        """Return True when title ``a`` equals ``b`` (looser matches are commented out)."""
        # does a match b
        if a == b:
            return True
#        elif a.find(b+'_(')==0:
#            return True
#        elif a.find(b+',')==0:
#            return True
        else:
            return False
    # Snapshot, so the prefixes added below can be reported afterwards.
    oldset = self.prefixes.copy()
    if rows:
        heading(2, "Primary topic", className="debug")
        debug('Analyzing circular links')
        for tup in cursor:
            # Byte strings are decoded; other column types pass through.
            (inrd, inpage, dabpage, outpage, outrd, hatnote) = tuple(s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in tup)
            print ''
            # XXX What are we doing here?
            # True when the linking page, or any title redirecting to it,
            # equals one of the current prefixes.
            if any(checkTitle(inpage, prefix) or inrd and any(checkTitle(s, prefix) for s in inrd.split('|')) for prefix in self.prefixes):
                if hatnote: # is {{dablink}} on the page?
                    wikipedia.output("[[%s]] should be listed as a primary topic"%inpage.replace('_', ' '))
                    primlink[inpage] = True
                    #
                    if inrd:
                        # inrd is the GROUP_CONCAT of redirect titles, '|'-separated.
                        for x in inrd.split('|'):
                            self.addprefix(x)
                    else:
                        self.addprefix(inpage)
                else:
                    warn("Missing hatnote on [[%s]]"%inpage)
    newprefixes = sorted(self.prefixes.difference(oldset))
    if newprefixes:
        info("Adding prefixes: %s" % EnglishJoin(newprefixes))
    for link in primlink:
        if re.search(r"'''\[\[%s\]\].*" % wikilinkregex(link), self.text):
            wikipedia.output('[[%s]] is already bolded' % link.replace('_', ' '))
        else:
            # Move the list entry to the top as "'''[[link]]''' is <rest>."
            # followed by a blank line and the text that preceded the entry.
            self.text = re.sub(r"(?sm)(.*)^[#*]+ *(\[\[%s\]\])[, ]*([^\n]*)\n?" % wikilinkregex(link), r"'''\2''' is \3.\n\n\1", self.text)
def bluelinks(self):
    """Look up existing pages whose titles resemble this page's prefixes.

    Builds LIKE patterns from ``self.prefixes``, runs one query (retried
    once with simpler name patterns and a longer LIMIT comment value when
    the first attempt fails with MySQL error 1317), re-links matching plain
    text in ``self.text`` and adds the remaining hits as new sections via
    ``self.addsection``.  Hits that are not added are listed in the output.

    NOTE(review): indentation was reconstructed; ``time`` is used here but
    its import is not visible in this part of the file.
    """
    def parameters_links(prefixes):
        """Return three LIKE patterns per prefix: "X_(%)", "X,_%" and "X_(%),_%"."""
        pf_list = ()
        for prefix in prefixes:
            prefix_esc = likeescape(prefix)
            pf_list += (prefix_esc+'\\_(%)', prefix_esc+',\\_%', prefix_esc+'\\_(%),\\_%',)
        return tuple(pf_list)
    def parameters_names(prefixes):
        """Return two LIKE patterns per prefix, chosen by the prefix's shape."""
        list = ()
        for prefix in prefixes:
            prefix_esc = likeescape(prefix)
            # Acronym
            if prefix.isupper() and prefix.isalpha(): # No spaces/digits/_ = Initials
                list += ('%\\_'.join(prefix)+'%', prefix+'\\_%',)
            elif prefix.count('_') > 2 or re.search(ur'[\W\d]', prefix.decode('utf-8') if isinstance(prefix, bytes) else prefix, flags=re.U):
                # Multiple spaces or [0-9], puncuation, symbols
                # Two empty patterns are appended for such prefixes.
                list += ("", "", )
            # Two names
            elif prefix.count('_') == 1: # First_Last
                # TODO X_FIRST_LAST
                # FIXME X_(Y) does not work
                list += (prefix.replace('_', r'\_%\_'), prefix.replace('_', r'%\_')+r'\_(%)', )
            elif prefix.count('_') == 2:
                list += (prefix.replace('.', '').replace('_', r'\_%\_'), prefix.replace('_', r'%\_')+r'\_(%)', )
            else: # First or Last
                list += (prefix_esc+'\\_%', "%\\_"+prefix_esc,)
        return tuple(list)
    def run_query(title_search=(), name_search=("",), timeout=90):
        """Execute the search query on the module-level cursor.

        Parameters are bound in placeholder order: disambiguation
        categories, CatPlaces, the prefixes, title_search, name_search and
        finally the last tuple produced by ``zip(*self.existingLinks)``.
        ``timeout`` is only written into the leading SQL comment.
        NOTE(review): an empty ``self.existingLinks`` would make ``.pop()``
        fail on an empty list - confirm callers guarantee at least one link.
        """
        cursor.execute("/* dabfix.bluelinks SLOW_OK LIMIT:"+str(timeout)+""" NM */
SELECT
page_namespace,
page_title,
rd_namespace,
rd_title,
/* TODO query might be faster with join rather then subqueries */
(SELECT COUNT(*) FROM pagelinks JOIN categorylinks ON cl_from=pl_from WHERE pl_namespace=page_namespace AND pl_title=page_title AND cl_to IN ("""+','.join(('%s',)*len(self.disambiguationcategory))+""")) AS "dabcount",
backlinks,
/* XXX or maybe categories are better */
EXISTS (SELECT 1
FROM categorylinks
WHERE cl_to REGEXP "Living_people|.*_births$|.*_deaths$"
AND cl_from = page_id
) AS Person,
EXISTS (SELECT 1
FROM categorylinks
WHERE cl_to REGEXP %s
AND cl_from = page_id
) AS Geography,
IFNULL(pp_value, page_title) AS Sortkey
FROM (
SELECT page_id, page_namespace, page_title, rd_namespace, rd_title, IFNULL(rd_title, page_title) AS target, COUNT(pl_from) AS "backlinks", pp_value
FROM (SELECT page_id, page_namespace, page_title
FROM page
WHERE page_namespace=0
AND (
"""
+' OR '.join(('page_title=%s',)*len(self.prefixes))
+'\n OR \n'
+' OR '.join(('page_title LIKE %s',)*len(title_search))
+"""\n OR \n(("""
+' OR '.join(('page_title LIKE %s',)*len(name_search))
+""")
AND CAST(page_title AS CHAR CHARACTER SET utf8) REGEXP "^[-\'`.[:alpha:]]+(_[[:upper:]][-\'`.[:alpha:]]*)?_[[:upper:]][-\'`[:alpha:]]+$"
)
)) AS searched_pages
LEFT JOIN pagelinks ON pl_namespace=0 AND pl_title=page_title
LEFT JOIN redirect ON rd_from=page_id AND rd_namespace=0
LEFT JOIN page_props ON pp_page=page_id AND pp_propname="defaultsort"
GROUP BY page_id
ORDER BY
/* Parentheses terms first */
INSTR(target, '_(') OR INSTR(target, ',') DESC,
target,
rd_title IS NOT NULL ASC,
backlinks DESC
LIMIT 25000
) AS r
GROUP BY target
HAVING target NOT IN ("""+','.join(('%s',)*len(self.existingLinks))+""")
ORDER BY INSTR(page_title, '_(') AND rd_title IS NULL DESC, FLOOR(LOG10(backlinks)) DESC, Person DESC, Geography DESC, Sortkey ASC
LIMIT 500
""", self.disambiguationcategory+(CatPlaces,)+tuple(self.prefixes)+title_search+name_search+zip(*self.existingLinks).pop())
    try:
        query_start = time.time()
        run_query(title_search=parameters_links(self.prefixes), name_search=parameters_names(self.prefixes))
    except MySQLdb.OperationalError as (errno, strerror):
        self.reconnect()
        if errno == 1317: # 'Query execution was interrupted'
            error("Blue link search: Timed out (%d seconds)"% (time.time()-query_start,))
            wikipedia.logtime("Blue link search: Retry with simpler query")
            try:
                # Retry with a single "prefix_%" name pattern per prefix.
                run_query(title_search=parameters_links(self.prefixes), name_search=tuple(likeescape(prefix)+"\\_%" for prefix in self.prefixes), timeout=240)
            except MySQLdb.OperationalError as (errno, strerror):
                error("Blue link search (simple): Timed out (%d seconds)"% (time.time()-query_start,))
                wikipedia.logtime("Blue link search (simple): Timed out")
                self.addsection('Blue link recovery', "* FAILED %s" % strerror)
                return
        else:
            raise
    #finally:
    #    with open('./generation_stats/getbluelinks', 'a') as f:
    #        f.write('%d\n' % (time.time()-query_start,))
    # One dict per category; each maps target title -> row details.
    addlinks = dict(
        blue = {},
        names = {},
        places = {},
        people = {},
        geo = {},
    )
    rows = cursor.rowcount
    if rows:
        heading(2, 'Blue link recovery', className="debug")
    for tup in cursor:
        tup = tuple(s.decode('utf-8') if isinstance(s, bytes) else s for s in tup)
        (page_ns, page, rd_ns, rd_target, dabcount, count, people, geography, sortkey) = tup
        # Re-link removed links
        self.text,success = re.subn(RelinkText % wikilinkregex(tup[1]), r'\1[[\2]]\3', self.text, 2)
        if success:
            continue
        # otherwise
        # Classification order matters: the person and geography flags win
        # over the punctuation-based guesses.
        if people:
            listkey = 'people'
        elif geography:
            listkey = 'geo'
        elif '(' in page or '-' in page:
            listkey = 'blue'
        elif ',' in page:
            listkey = 'places'
        else:
            listkey = 'names'
        # Keyed by redirect target when the row is a redirect.
        key = rd_target or page
        addlinks[listkey][key] = (page, dabcount, count, people, geography, sortkey)
    # Keys that were found but not added to any section.
    extra = set([])
    for (listkey, listname, maxsize) in (
        ('blue', "blue links", 25,),
        ('people', "People", 40,),
        ('geo', "Places", 40,),
        ('names', "names", 20,),
        ('places', "place-like names", 15,),
        ):
        addlist = addlinks[listkey]
        if not addlist: continue
        # TODO custom sort key for People
        # XXX better if sorted by link count (c)
        # addsort = sorted(addlist, key=lamdba tup: tup[2], reverse=True)
        # most: every page title; good: non-redirects with no links from
        # disambiguation pages; top: good entries with count >= self.median.
        most = sorted((t for k,(t,d,c,p,g,s) in addlist.iteritems()), key=lambda t:addlist.get(t, (t,))[-1])
        good = sorted((t for k,(t,d,c,p,g,s) in addlist.iteritems() if k==t and d==0), key=lambda k:addlist[k][5])
        top = sorted((t for k,(t,d,c,p,g,s) in addlist.iteritems() if k==t and c>=self.median and d==0), key=lambda k:addlist[k][5])
        debug(u"%s pages found: %d [%d redirects/dab-linked, %d links ≥ median (%d)]"\
            %(listname.capitalize(), len(most), len(addlist)-len(good), len(top), self.median,))
        extra |= set(addlist.keys())
        # The first list that fits its size limit is added.
        if 0 < len(most) <= maxsize // 2:
            self.addsection('%d recovered %s%s'%(len(most), listname, ' (All redirects)' if len(good)==0 else ''), bullets(most))
            extra -= set(addlist.keys())
            extra -= set(most)
        elif 0 < len(good) <= maxsize:
            self.addsection('recovered %s (%d non-redirects)'%(listname, len(good)), bullets(good))
            extra -= set(good)
        elif 0 < len(top) <= maxsize:
            self.addsection('recovered %s (%d high value)'%(listname, len(top)), bullets(top))
            extra -= set(top)
        else:
            wikipedia.output("Too many %s to add (%d/%d)" % (listname, len(addlist), maxsize))
    if len(extra):
        info("The following titles were NOT included because they are redirects, linked from another disambiguation page, or were linked less than the median on this page.")
        # NOTE(review): both branches of this conditional are the same empty
        # string as written; markup may be missing from the literals.
        print('' if rows > 6 else '')
        for key in sorted(extra):
            for listkey in addlinks:
                if key in addlinks[listkey]:
                    (title, dabcount, count, people, geography, sortkey) = addlinks[listkey][key]
                    # Reasons shown in parentheses after the link.
                    x = []
                    if title!=key:
                        x.append("redirects to %s" % CreateLink(key))
                    if dabcount:
                        x.append("linked from %d disambiguation pages" % dabcount)
                    if count != dabcount and dabcount > 0:
                        x.append("%d links" % count)
                    printu('%s%s '%(CreateLink(title, className='mw-redirect' if title!=key else ''), ' (%s)'%EnglishJoin(x) if x else '',))
        print(' ')
def redlinks(self):
    """Look up links to non-existent pages whose titles resemble the prefixes.

    Queries pagelinks for targets with no page row that match LIKE patterns
    built from ``self.prefixes``, prints each hit with its backlink
    information, re-links matching plain text in ``self.text`` and adds the
    selected titles as sections through ``self.addsection``.

    NOTE(review): indentation was reconstructed, and several string
    literals below look as if markup has been lost from them (literals
    broken across lines, ``%`` formatting applied to strings without
    placeholders).  They are reproduced unchanged.
    """
    addlinks = {}
    addnames = {}
    addmissing = {}
    addmisname = {}
    addtemplatelinks = {}
    # Matched against the titles of the pages that contain the red link.
    missing_P = r'missing|encyclopedia|redlinks|wikiproject(?!.*Red_Link_Recovery|.*COIReports)'
    missing_R = re.compile(missing_P, re.I)
    # SQL fragment with four placeholders; repeated once per prefix below.
    red_params = 'pl_title LIKE %s OR pl_title LIKE %s OR pl_title LIKE %s OR ((pl_title LIKE %s) AND CAST(pl_title AS CHAR CHARACTER SET utf8) REGEXP "^[-\'`.[:alpha:]]+(_[[:upper:]][-\'`.[:alpha:]]*)?_[[:upper:]][-\'`[:alpha:]]+$")'
    # title_search and name_search are filled in below but are not
    # referenced again in this method outside commented-out prints.
    title_search = ()
    name_search = ()
    #'''
    # TODO port over to dab solver
    # https://toolserver.org/~dispenser/cgi-bin/dab_solver.py/Baller_Blockin'_(film)
    # https://toolserver.org/~dispenser/cgi-bin/dabfix.py/Manuel_Vazquez
    # Missing Manuel Vázquez (accient on a from redirect)
    for prefix in self.prefixes:
        #
        title_search += (
            # begins
            likeescape(prefix)+'\\_(%)',
            likeescape(prefix)+',\\_%',
            likeescape(prefix)+':\\_%',
            likeescape(prefix)+'\\_(%),\\_%',
            # trails
            '%:\_'+likeescape(prefix),
        )
        #
        if prefix.count('_') > 1 or not re.search(ur'[\W\d]', prefix, flags=re.U):
            name_search += (
                likeescape(prefix)+'\\_%',
                "%\\_"+likeescape(prefix),
            )
        # Two names
        elif prefix.count('_') == 1: # First_Last
            # TODO X_FIRST_LAST
            # #FIXME X_(Y) does not work
            name_search += (
                likeescape(prefix).replace('\\_', '\\_%\\_'),
                likeescape(prefix).replace('\\_', '\\_%')+"\\_(%)",
            )
        elif prefix.isupper() and prefix.isalpha(): # Initials + no underscore
            name_search += (
                '%\\_'.join(prefix)+'%', likeescape(prefix)+'\\_%',
            )
        # XXX what's the last test for? numbers and symbols? does it match the unicode chars?
    #
    def redlink_fulltext_searcher(dbname, namespace, pfx_list):
        """Return up to 8 full-text matches per prefix from the redlinks table.

        Returns an empty tuple for any namespace other than 0, for a
        database name not starting with "enwiki", and after any exception
        (which is reported through ``error`` after reconnecting).
        """
        if namespace!=0 or not dbname.startswith('enwiki'):
            return ()
        try:
            # One parenthesised SELECT per prefix, combined with UNION.
            cursor.execute(' UNION '.join(("""(
/* related.redlink_fulltext_searcher LIMIT:1 */
SELECT REPLACE(rls_title_ft, ' ', '_')
FROM u_dispenser_p.redlinks_enwiki
WHERE MATCH (rls_title_ft) AGAINST (%s IN NATURAL LANGUAGE MODE)
/* ORDER BY is implicit */
LIMIT 8
)""",)*len(pfx_list)), tuple(p.replace('_', ' ') for p in pfx_list))
            return zip(*cursor.fetchall()).pop()
        except Exception as e:
            self.reconnect()
            error("Redlink FullText search: %r"%(e,))
            return ()
    # print repr(prefixes)
    print ''
    print ' '.join(redlink_fulltext_searcher(self.site.dbName(), 0, self.prefixes))
    print '
'
    # print title_search
    # print ' '
    # print name_search
    # print ' '
    #'''
    def redLinkParameters(prefixes):
        """Return the four LIKE patterns per prefix that fill ``red_params``."""
        list = ()
        for p in prefixes:
            list += (likeescape(p)+'\\_(%)', likeescape(p)+',\\_%', likeescape(p)+'\\_(%),\\_%', likeescape(p)+'\\_%',)# '%\\_'+likeescape(p),)
        return list
    try:
        query_start = time.time()
        # Parameters are bound in placeholder order: disambiguation
        # categories, existing links, then four patterns per prefix.
        # Literal percent signs in DATE_FORMAT are doubled for the driver.
        cursor.execute("""
/* dabfix.redlinks() LIMIT:90 */
SELECT
pl_namespace,
pl_title,
ns_name, /* text for pl_namespace above */
COUNT(*) AS link_count,
SUM(ref.page_namespace = pl_namespace) AS ns_links,
(SELECT GROUP_CONCAT(DISTINCT DATE_FORMAT(log_timestamp, "%%b %%Y") SEPARATOR ", ")
FROM logging_ts_alternative
WHERE log_namespace = pl_namespace AND log_title = pl_title
AND log_action = "delete"
) AS log_deletes,
GROUP_CONCAT(ref.page_namespace SEPARATOR "|") AS ns_context,
GROUP_CONCAT(ref.page_title SEPARATOR "|") AS context,
SUM((SELECT STRAIGHT_JOIN COUNT(*) FROM templatelinks
/* MySQL optimizer on some TS database configurations, likely due to bad
* statistics. See TS-1190, workaround using STRAIGHT_JOIN
*/
JOIN page AS trans ON trans.page_id=tl_from AND trans.page_namespace=0
WHERE tl_namespace=ref.page_namespace AND tl_title=ref.page_title
)) AS trans_count
-- , SUM(ref.page_title REGEXP "missing|encyclopedia|redlinks")
FROM page AS ref
JOIN pagelinks ON pl_from = ref.page_id
LEFT JOIN categorylinks ON cl_from = ref.page_id AND cl_to IN ("""+','.join(('%s',)*len(self.disambiguationcategory))+""")
JOIN toolserver.namespace ON dbname = (SELECT DATABASE()) AND ns_id = pl_namespace
LEFT JOIN page AS pl ON pl.page_namespace=pl_namespace AND pl.page_title=pl_title
WHERE pl.page_id IS NULL
AND pl_title NOT IN ("""+','.join(("%s",)*len(self.existingLinks))+""")
AND ("""+' OR '.join((red_params,)*len(self.prefixes))+""")
AND pl_namespace = 0
/* Content namespaces only */
AND ref.page_namespace IN (0, 2, 4, 6, 8, 10, 12, 14)
/* No disambiguation pages (also hack to correct ns_links) */
AND cl_to IS NULL
GROUP BY pl_namespace, pl_title
-- HAVING log_deletes IS NULL
ORDER BY
/* Parentheses terms first */
INSTR(pl_title,'_(') OR INSTR(pl_title,',') DESC,
/* Article backlink count in graduations */
FLOOR(LOG2(SUM(ref.page_namespace=0))) DESC,
/* Put deleted items at the bottom */
log_deletes IS NOT NULL,
-- /* Case-insensitive alphabetize */
-- pl_title_ci ASC
pl_title ASC
""", self.disambiguationcategory+zip(*self.existingLinks).pop()+redLinkParameters(self.prefixes))
    except MySQLdb.OperationalError as (errno, strerror):
        self.reconnect()
        if errno == 1317: # 'Query execution was interrupted'
            error("Red link search timed out (%d seconds)"% (time.time()-query_start,))
            self.addsection('Red links', "* FAILED %s" % strerror)
            return # Abort
        else:
            raise
    #finally:
    #    with open('./generation_stats/getredlinks', 'a') as f:
    #        f.write('%d\n' % (time.time()-query_start,))
    rows = cursor.rowcount
    if rows:
        heading(2, "Red link recovery", className="debug")
        print('' if rows >= 6 else '')
    for tup in cursor:
        # FIXME UTF-8 strings are sometimes cut short
        (pl_namespace, pl_title, ns_name, links, ns_links, log_deletes, ns_context, context, trans_count) = tuple(s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in tup)
        # will we add the link?
        # TODO create blacklist for Wikipedia:Templates with red links/xxx
        # re.search(r'(?mi)^\*+[ \'"]*(%s)(\b|[,|\-])' % wikilinkregex(pl_title.replace('_', ' ')), self.text)
        intext = re.search(RelinkText % wikilinkregex(pl_title), self.text)
        missing = missing_R.search(context)
        if log_deletes and ns_links==0:
            # FIXME Combine with better code from other places to classify deleted links
            continue
        elif log_deletes:
            extra_info_html = ', deleted %s ' % log_deletes
        elif intext:
            extra_info_html = '(Linking text) '
            # Turn up to two plain-text occurrences into wiki links.
            self.text, success = intext.re.subn(r'\1[[\2]]\3', self.text, 2)
        elif missing:
            extra_info_html = '(Missing article) '
            # Titles with ",_", "_(" or "-", or with no underscore at all,
            # go to the article list; the rest are treated as names.
            if any(c in pl_title for c in (',_','_(','-',)) or '_' not in pl_title:
                addmissing[pl_title] = True
            else:
                addmisname[pl_title] = True
        elif ns_links>=2 and links>=4:
            if '10' in ns_context.split('|'):
                extra_info_html = '(%d transclusions) ' % (trans_count,)
                try:
                    # Title at the same position as the first namespace-10 entry.
                    addtemplatelinks[pl_title] = context.split('|')[ns_context.split('|').index('10')]
                except IndexError:
                    addtemplatelinks[pl_title] = ''
            else:
                extra_info_html = '(Recover) '
                if any(c in pl_title for c in (',_','_(','-',)) or '_' not in pl_title:
                    addlinks[pl_title] = True
                else:
                    addnames[pl_title] = True
        else:
            extra_info_html = ''
        # Render HTML list
        # NOTE(review): the format string has no placeholder as written.
        print '' % ('' if (ns_links-trans_count>=2 and links-trans_count>=4 or links>=rows//5) or intext or missing else 'debug', )
        printu(CreateLink(pl_title, className="new"))
        if ns_context != '0': # Avoid "1 article link"
            printu(' (%s)'%CreateLink("Special:WhatLinksHere/%s"%pl_title, "%s%d %s%s"%(
                '%d article link%s / '%(ns_links-trans_count if ns_links>=trans_count else ns_links, '' if ns_links==1 else 's') if links > ns_links > 0 else '',
                links,
                'article link' if links==ns_links else 'link',
                '' if links==1 else 's',
                ), addAttribute=' onclick="toggleNode(this.parentNode.nextSibling);return false;"', className="rl_expand")
            )
        print extra_info_html
        try:
            if ns_context != '0':
                # NOTE(review): two values are formatted into a string with
                # no placeholders as written.
                printu(' '% (
                    ' style="display:none"' if ns_context != '0' else '',
                    ' \n'.join(CreateLink(t if n=='0' else (wikipedia.namespaces[int(n)] or '')+':'+t) for n,t in zip(ns_context.split('|'), context.split('|'),)),
                ))
            else:
                printu(' from %s'% (CreateLink(context),))
        except Exception as e: # FIXME specific exception
            printu('')
            error('excessed max packet size (%r)' % e)
        print ' '
    if rows:
        print ' '
    # A list is added as a section only when it fits its size limit.
    for (addlist, listname, maxsize) in (
        (addlinks, "red links", 25),
        (addnames, "red link names", 10),
        (addmissing,"missing articles", 10),
        (addmisname,"missing names", 10),
        ):
        if not addlist:
            continue
        elif len(addlist) <= maxsize:
            self.addsection('%s' % listname, bullets(addlist.keys()))
        else:
            wikipedia.output("Too many %s to add (%d/%d)" % (listname, len(addlist), maxsize))
    if addtemplatelinks:
        # Templated links are added only when there are at most five of them
        # and both addnames and addlinks are non-empty.
        if len(addtemplatelinks)<=5 and addnames and addlinks:
            self.addsection('templated red links',
                ''.join(("* [[%s]], [[Template:%s|]]\n"%(link, template)).replace('_',' ') for link, template in addtemplatelinks.items())
            )
        else:
            wikipedia.output("Not adding %d templated red links" % len(addtemplatelinks))
def addtemplatelinks(self):
    """Add a fixed "Copy and Paste stuff" section listing optional templates."""
    # TODO avoid {{in title|one_(disambiguation)}}
    reminder_lines = (
        "* Only use these if applicable (i.e. you checked)",
        "* {{in title}}",
        "* {{Lookfrom}}",
    )
    section_body = '\n'.join(reminder_lines)
    self.addsection('Copy and Paste stuff', section_body)
def seealso(self):
    """List other disambiguation pages that link to this page.

    The query collects pages in the disambiguation categories that link to
    the current title, directly or through a redirect, and flags whether
    the current page already links back.  Every hit is printed; the ones
    not yet linked are added as an 'EXPERIMENTAL See also' section.  On any
    exception the connection is reset and a FAILED section is added.
    """
    try:
        page_key = (self.page.namespace(), self.page.title(underscore=True),)
        category_slots = ",".join(("%s",)*len(self.disambiguationcategory))
        # Bound in order: the Linked sub-select, then each UNION branch
        # (namespace, title, categories).
        query_args = page_key + (page_key + self.disambiguationcategory)*2
        cursor.execute("""/* dabfix.seealso LIMIT:30 */
SELECT page_title, page_title IN (
SELECT IFNULL(rd_title, pl_title)
FROM page
JOIN pagelinks ON pl_from = page.page_id
JOIN page AS pl ON pl.page_namespace=pl_namespace AND pl.page_title=pl_title
LEFT JOIN redirect ON rd_namespace=pl_namespace AND rd_from=pl.page_id
WHERE pl_namespace=0
AND page.page_namespace=%s AND page.page_title = %s
) AS Linked
FROM (
SELECT page.page_title
FROM page
JOIN categorylinks ON cl_from = page.page_id
JOIN pagelinks ON page.page_id = pl_from
JOIN page AS rd ON rd.page_namespace = pl_namespace AND rd.page_title = pl_title
JOIN redirect ON rd_from = rd.page_id
WHERE page.page_namespace = 0
AND rd_namespace = %s AND rd_title = %s
AND cl_to IN ("""+category_slots+""")
UNION
SELECT page_title
FROM page
JOIN categorylinks ON cl_from = page.page_id
JOIN pagelinks ON pl_from = page.page_id
WHERE page.page_namespace = 0
AND pl_namespace = %s AND pl_title = %s
AND cl_to IN ("""+category_slots+""")
) AS r;
""", query_args)
    except Exception as e:
        self.reconnect()
        error("See also: %r"%(e,))
        self.addsection('Auto-See also', "* FAILED %s" % e)
        return
    rows = cursor.rowcount
    if not rows:
        return
    heading(2, "See also links", className="debug")
    #wikipedia.output("Other disambiguation pages which link here")
    unlinked = {}
    print('' if rows > 6 else '')
    for title, exists in cursor:
        status = "already linked" if exists else "may need to be linked"
        print('%s (%s) ' % (CreateLink(title), status))
        if not exists:
            unlinked[title] = title.decode('utf-8').replace('_', ' ')
    print(' ')
    self.addsection('EXPERIMENTAL See also', bullets(unlinked.values()))
def addInterwikis(self):
    """Placeholder for interwiki handling; does nothing and returns None.

    Query sketch kept for a future implementation:

    SELECT *
    FROM page
    LEFT JOIN iwlinks ON iwl_from=page_id
    WHERE page_namespace=0 AND page_title="SAN";
    """
    return None
def mosfixes(self):
    """Apply heading and template clean-ups to ``self.text``.

    Three steps: rename a fixed set of section headings, convert bolded or
    ";"-style pseudo headings into real headings, and append a
    disambiguation template when the page has none (checked first against
    the database, then against the text).
    """
    # Rename headings
    heading_renames = (
        ('Real people', 'People'),
        ('Historical persons', 'People'),
        ('Persons', 'People'),
        ('Fictional characters', 'Characters'),
        ('Movies', 'Films'),
        ('Computer gaming', 'Video gaming'),
        ('Computer games', 'Video games'),
        ('First name', 'Given name'),
        ('Last name', 'Surname'),
        ('Geography', 'Places'),
        ('Geographical locations', 'Places'),
        ('Place names', 'Places'),
        ('Miscellaneous', 'Other uses'),
        ('Ohter usages', 'Other uses'),
    )
    for old_name, new_name in heading_renames:
        heading_pattern = r'(?mi)^(=+ *)%s(?= *=+ *$)' % re.escape(old_name)
        self.text = re.sub(heading_pattern, r'\g<1>%s' % new_name, self.text)

    # Use actual headings instead of bolding
    def makeheading(m):
        """Build a heading one level below the deepest heading seen so far."""
        # luckly the re module uses a temporary string
        preceding = self.text[:m.end()]
        if '\n====' in preceding:
            depth = '====='
        elif '\n===' in preceding:
            depth = '===='
        elif '\n==' in preceding:
            depth = '==='
        elif '\n=' in self.text:
            depth = '=='
        else:
            # Use level 3 in no other headers are present
            depth = '==='
        label = m.group(4)
        caption = label[0].upper() + label[1:].replace("'''", '')
        return ' '.join((depth, caption, depth))
    self.text = re.sub(r"(?mi)^('''|;|In '''|As a? *''') *(In |As |)(the |)(([\w\s]|''' and ''')+?) *('''|:| )*$", makeheading, self.text)

    # copied from [[MediaWiki:Disambiguationspage]]
    setindex_templates = ("SIA", "Given name", "Hawaiiindex", "Mountainindex",
        "Plant common name", "Disambig-plants", "Roadindex", "Shipindex",
        "Sportindex", "Surname",
    )
    disambig_template = (
        "Disambiguation",
    )
    page_key = (self.page.namespace(), self.page.title(underscore=True))
    cursor.execute("""/* dabfix.mosfixes LIMIT:30 */
SELECT 1
FROM page
JOIN templatelinks ON tl_from=page_id
JOIN pagelinks ON pl_namespace=tl_namespace AND pl_title=tl_title
WHERE page_namespace=%s and page_title=%s
AND pl_from = (SELECT page_id FROM page WHERE page_namespace=8 AND page_title="Disambiguationspage")
""", page_key)
    if cursor.fetchall():
        # The page already transcludes a template listed on the
        # Disambiguationspage message.
        return
    known_templates = disambig_template + setindex_templates
    if re.search(r'\{\{(%s)'%'|'.join(wikilinkregex(title) for title in known_templates), self.text):
        return
    if 'given names]]' in self.text:
        addtemplate = "{{given name}}"
    elif 'surnames]]' in self.text:
        addtemplate = "{{surname}}"
    else:
        addtemplate = "{{disambiguation}}"
    # Insert before the trailing run of [[...]] / {{...}} lines.
    self.text = re.sub(r'(?is)\n*((\n\[\[[^[\]]+\]\]\s*|\n\{\{[^{|}[\]]+\}\}\s*)*)$', r"\n\n%s\n\1"%(addtemplate,), self.text)
def template_redirect(self):
    """Bypass selected template redirects and normalise TOC/category markup.

    Template redirects transcluded on the page are replaced by their target
    name when the target is in the local bypass list or is linked from the
    AutoWikiBrowser template-redirects page; others are only reported.
    Afterwards TOC markers are removed and four category links are folded
    into the disambiguation template as parameters.
    """
    # List template redirects
    bypass_targets = (
        # Used by our software
        "Disambiguation", "Hndis", "Geodis",
        "FORCETOC", "TOC_left", "TOC_right",
        "Wiktionary",
        #"In_title",
        # Used in parseline()
        "Spaced_ndash", "Ndash", "Mdash",
    )
    query_args = (4, "AutoWikiBrowser/Template_redirects", self.page.namespace(), self.page.title(underscore=True))
    cursor.execute("""/* dabfix.template_redirect LIMIT:30 */
SELECT rd_title, tl_title, (
SELECT 1
FROM page
JOIN pagelinks ON pl_from=page_id
WHERE pl_namespace=10 AND pl_title=rd_title
AND page_namespace=%s AND page_title=%s
) AS awb_bypass
FROM page
JOIN templatelinks ON tl_from=page_id AND tl_namespace=10
JOIN page AS tpl ON tpl.page_namespace=tl_namespace AND tpl.page_title=tl_title
JOIN redirect ON rd_from=tpl.page_id AND rd_namespace=10
WHERE page.page_namespace=%s AND page.page_title=%s
""", query_args)
    for row in cursor:
        template, redirect, awb_bypass = [
            col.decode('utf-8') if isinstance(col, bytes) else col
            for col in row
        ]
        if not (template in bypass_targets or awb_bypass):
            wikipedia.output("Transclusion redirect: [[Template:%s]] to [[Template:%s]]" % (redirect, template,))
            continue
        # TODO Capitalize if newlines are present in the template
        # Backslashes are doubled because repl is used as a re.sub template.
        repl = template.replace('_', ' ').replace('\\', '\\\\')
        repl = repl[0:1].lower() + repl[1:]
        redirect_pattern = r'(?<=\{\{)\s*([Tt]emplate\s*:\s*|)%s\s*(?=\||\}\})' % wikilinkregex(redirect)
        self.text = re.sub(redirect_pattern, repl, self.text)
    # Remove TOCs and correctly added back with JS
    self.text = re.sub(r'(?i)\n?\{\{(TOC[_ ]+left|TOC[_ ]+right)\}\}\n?|\n?__(TOC|NOTOC|FORCETOC)__\n?', '\n', self.text)
    # TODO implement sort key
    #self.text = re.sub(r'\{\{hndis\s*(\|(?:\s*name\s*=\s*|(?=[^{|}=]*[|}]))([^{|}]*?))\s*\}\}', r'{{hndis|\2}}', self.text)
    # |name=Last, First
    # Use {{dismabig}} paramter features
    category_parameters = (
        ('surnames', 'surname'),
        ('given names', 'given name'),
        ('Place name disambiguation pages', 'geo'),
        ('Airport disambiguation', 'airport'),
    )
    for category, parameter in category_parameters:
        category_pattern = r'(?is)(\{\{disambig.*?)\}\}(.*?)\n?\[\[Category:%s\]\]' % category
        self.text = re.sub(category_pattern, r'\1|%s}}\2' % parameter, self.text)
def intro(self):
    """Normalise the introductory line of ``self.text``.

    Steps, in order:
    1. Rewrite legacy intro templates (Disamb1, Mayrefer, ...) to
       ``{{subst:refer``.
    2. When the text has neither bold markup nor ``{{subst:refer``, prepend
       a "'''X''' may refer to:" line built from ``self.prefixes``.
    3. Rewrite the many free-form intro phrasings to one of four standard
       sentences (see ``refer``).
    4. Replace a standard sentence with a single bolded subject by the
       matching ``{{subst:refer}}`` call (see ``referTemplate``).

    Fix: the named groups had been lost from both patterns and from the
    ``\\g<...>`` templates, so the first pattern did not compile and
    ``m.group('subject')`` / ``m.group("refer")`` could not resolve.  They
    are restored as ``subject``, ``also`` and ``refer``.  The first pattern
    is written as a plain raw string: it is pure ASCII, so matching is
    unchanged, and the method also parses on interpreters without the
    ``ur`` prefix.
    """
    self.text = re.sub(r'(?i)\{\{(?:Disamb1|Mayrefer|Mayreferto|May refer to|Refer|Refers)(?=\s*[|}])', '{{subst:refer', self.text)
    if not ("'''" in self.text or "{{subst:refer" in self.text):
        #TODO should discard minor case varriants
        terms = {}
        for prefix in self.prefixes:
            #term = "'''%s%s'''" % (prefix[0], prefix[1:].replace('_', ' '),)
            term = "'''%s'''" % prefix.replace('_', ' ')
            # Keep only the first term seen for each base form.
            base = strip_variations(term)
            if base not in terms:
                terms[base] = term
        termstext = EnglishJoin(sorted(terms.values()))
        wikipedia.output('Adding "%s may refer to:"'%termstext)
        self.text = "%s may refer to:\n\n"%termstext + self.text
    # Phrasings seen in the wild that the pattern below is meant to cover:
    # It may also refer to:
    # '''X''' refers
    # '''X''' can mean:
    # '''X''' can mean the following things.
    # '''X''' can refer to:
    # '''X''' can refer to either:
    # '''X''' can refer to several things:
    # '''X''' can refer to the following:
    # '''X''' can be used to refer to:
    # '''X''' could mean:
    # '''X''' is the name of:
    # '''X''' may also signify:
    # '''X''' may be:
    # '''X''' may mean:
    # '''X''' may refer to any of the following:
    # '''X''' may represent
    # '''X''' may refer to more than one thing:
    # '''X''' means following
    # '''X''' might refer to one of the following:
    # '''X''' has several meanings
    # '''X''' has various meanings:
    # '''X''' has the following meanings:
    # A '''X''' can be:
    # The term '''X''' may refer to any one of the following:
    # The expression '''X''' can refer to:
    # Do you mean...
    # '''X''' may refer to several places:
    # '''X''' may refer to several places in [[L]]
    # '''X''' may be an abbreviation for:
    # '''X''' is an abbreviation for:
    # '''X''' is a [[three-letter acronym]] that may refer to:
    # '''X''' is a [[TLA|three-letter abbreviation]] and represents
    # '''X''' is an abbreviation that may stand for:
    # '''X''' as an [[abbreviation]] may refer to:
    # '''X''' can stand for:
    # The [[abbreviation]] '''X''' can be:
    def refer(m):
        """Return the standard intro sentence for the matched free-form line."""
        if re.search(r'(?i)\{\{(Hndis|Hndab|Hndisambig|Bio-dab)', self.text):
            return m.expand(r'\g<subject> is\g<also> the name of:')
        elif re.search(r'(?i)\{\{(Geo-?dis|Geodab)', self.text):
            return m.expand(r'\g<subject> may\g<also> refer to several places:')
        elif re.search(r'\b(acronym|abbreviation|inital|stand)\b', m.group()):
            # or self.page.title().isupper():
            return m.expand(r'\g<subject> may\g<also> stand for:')
        else:
            return m.expand(r'\g<subject> may\g<also> refer to:')
    self.text = re.sub(r"^(?:A |And |As an? |The |expression |term |\[*three[ -]letter acronym\]* |\[*acronym\]* |\[*abbreviation\]* |)*(?P<subject>('''[^{|}[\]\n']+?''',?( or|) *)+|It|Did you|Do you)( can| could| is| is an?| as| as an?| has| may| might|)(?P<also> also|)([ \-]+(\[\[|\[\[[^{|}[\]<\n>]+\||)(be|be an?|means?|meanings?|refers?|represents?|signify|the name|various|used to|several|an?|abbreviation|two|three|four|five|letter|acronym|initialism|stand|to|of|for|and|that|may|one of|any of|any one of|either|following|the following|several things|more than|one thing|several persons|several people|meanings?|several places)(\]\]|\b))+( in \[\[[^{|}[\]\n]+\]\]|)( \w+|)[:;. ]*$", refer, self.text, flags=re.M | re.U | re.I)
    def referTemplate(m):
        """Return the {{subst:refer}} call for a single-subject sentence."""
        #if m.group('subject') == "'''%s'''"%self.page.title():
        title = re.search(r"'''(.+?)'''", m.group('subject')).group(1)
        # Only convert when the subject is exactly one bolded term.
        if "'''%s'''"%title == m.group('subject'):
            return "{{subst:refer"+(
                "" if title == self.page.title() else '|'+title
            )+{
                "is the name of": "|type=name",
                "may stand for": "|type=stand",
                # TODO add pseudo place detection
                "may refer to several places": "|type=place",
                "may also refer to":"|type=also",
                "may refer to": "",
            }.get(m.group("refer"))+"}}"
        else:
            return m.group()
    self.text = re.sub(r"(?P<subject>('''[^{|}[\]<\r\n>]+?'''[ ,]*( or )?)+) (?P<refer>may refer to|may also refer to|may stand for|is the name of):", referTemplate, self.text)
def mosdab_suggestions(self, page_revision):
logname = "/home/dispenser/public_html/logs/mosdab-%s.log" % (self.site.dbName(),)
mosdab_codes = {
'B': 'Bold text on line (MOS:DABENTRY)',
'C': 'Capitalize first letter on line (MOS:DABENTRY)',
'E': 'External link',
'M': 'More than one blue link on a line (MOS:DABENTRY)',
'N': 'Red link with no blue link (MOS:DABRL)',
'O': 'More than one red link on a line (MOS:DABRL)',
'P': 'Punctuation on line (MOS:DABENTRY)',
'R': '<ref> tag',
'S': 'Unpiped #section on line',
'T': 'A link is labeled as to obscure the full title (usually dropping the " (subject)" part) (WP:PIPING)',
'X': 'Excessively long line',
'F': 'Less then two blue links on the page (WP:2DAB)',
'H': 'Huge unbroken list',
'L': 'No unordered list found',
'U': 'No links on line',
}
try:
with open(logname, 'rb') as log:
for line in log:
col = line.split('\t')
if col[0] == bytes(page_revision):
heading(2, "Manual of Style suggestions")
print "This disambiguation page has some automatically generated suggestions for fixes:
"
print '', '\n'.join("%s %s "%(c,mosdab_codes.get(c, "Unknown code %r"%c)) for c in col[2].split()), ' '
break
except IOError:
wikipedia.output("mosdab-%s.log does not exist" % self.site.dbName())
return
def run(self):
# Process one page: load its text into self.text, run the fixer methods over
# it, then print the diff, the timing report, the edit form, the Manual of
# Style suggestions, a table of linking wikiprojects and a table of contents.
# Returns None; several early exits below also return None.
# NOTE(review): indentation is not visible in this view and several string
# literals look like they lost embedded markup (literals split across lines,
# format strings with fewer placeholders than arguments). Verify nesting and
# literals against the original file before changing any statement here.
#
# Without a page title there is nothing to do: show the module docstring.
if not self.page.title():
wikipedia.output(__doc__)
return
try:
self.text = self.page.get()
except wikipedia.IsRedirectPage:
wikipedia.output("Converting redirect %s" % self.page.aslink())
# Content must end with a template, even {{subst:void}}
# The substitution keeps only the first [[...]] link found in the text and
# wraps it in a {{subst:refer}} stub.
self.text = re.sub(ur'(?s).*?(\[\[[^[\]\n]*?\]\]).*', ur'{{subst:refer}}\n\n* \1\n{{subst:long comment}}', self.page.get())
except wikipedia.NoPage:
# XXX Messing with the internals
self.page._contents = ""
# Seed text used when the page does not exist.
self.text = "{{subst:refer}}\n*\n\n{{disambiguation}}"
# Why isn't NoPage a subclass of something else?
try:
self.page.get()
except wikipedia.NoPage:
wikipedia.output(u'Page %s not found ' % self.page.aslink())
return
except Exception as e:
wikipedia.output('Exception: %r' % (e,))
return
wikipedia.output(__doc__)
print '▼ Show details ▼ '
# `headings` is the module-level list later used for the table of contents.
headings.append("Hide details")
# setup
self.getprefixes()
self.template_redirect()
self.doubleredirect()
self.primary_entry()
self.mosfixes()
# generate new links
self.addWiktionary()
self.bluelinks()
self.seealso()
self.redlinks()
self.addtemplatelinks()
self.intro()
## Commonfixes
if self.usecommonfixes:
heading(2, 'Common fixes', className="debug")
import commonfixes
self.text = commonfixes.fix(self.text, page = self.page, verbose = False)
# cgitb.enable() is called again right after commonfixes.fix().
# NOTE(review): presumably commonfixes changes the traceback hook -- confirm.
cgitb.enable(logdir='tracebacks')
# The {{subst:long comment}} placeholder is dropped from text longer than 300
# characters and appended to text shorter than 200 that does not contain it.
if len(self.text) > 300:
self.text = self.text.replace('{{subst:long comment}}', '')
if len(self.text) < 200 and '{{subst:long comment}}' not in self.text:
self.text += "\n{{subst:long comment}}"
# NOTE(review): which `if` the following `else` pairs with cannot be
# determined from this view.
else:
# Unpipe text
self.text = re.sub(r'\[\[([^{|}[\]]+)\s*\|\s*\1\s*\]\]', r'[[\1]]', self.text)
self.text = re.sub(r'(?i)\[\[([^{|}[\]\n]+)([^{|}[\]\n]+)\|\1\]\]\2', r'[[\1\2]]', self.text)
# run after creating blue links
# Each link target (the part before any #section or |label) is passed to
# skipredirect, whose return value replaces it.
self.text = re.sub(r'(?<=\[\[)[^{|}[\]\n#]+(?=(?:#[^{|}[\]\n]*|) *(?:\|.*?|)\]\])', skipredirect, self.text)
# music specific fixes
#self.text = re.sub(r"(?uim)^\*([^,\n]*),(['\"]*) *a? *(song|signle) by ([\w [\]]* band |)(?P[\w [\]]+)(? \d+|)(?P album .*?) *$", r"\1\2, a song on \g's\g\g", self.text)
print ''
heading(2, "Entry cleanup")
# parseline reads the module-level `prefixes`, so publish this instance's value.
global prefixes; prefixes = self.prefixes
# Every line starting with '#' or '*' is rewritten by parseline.
self.text = re.sub(r'(?m)^[#*]+.*$', parseline, self.text)
print '
'
# Majority vote on bullet style
# Counts are taken on the original page text: when fewer than a quarter of the
# bullets are followed by a space the spaces are stripped, when more than three
# quarters are, a space is enforced; otherwise the text is left alone.
if self.page.get().count("\n* ") < self.page.get().count("\n*") * 1 / 4:
# Remove spaces
self.text = re.sub(r'(?m)^([*]+) *', r'\1', self.text)
elif self.page.get().count("\n* ") > self.page.get().count("\n*") * 3 / 4:
# Add spaces
self.text = re.sub(r'(?m)^([*]+) *', r'\1 ', self.text)
else:
pass
# NOTE(review): the pattern below starts with "(m)", a group matching a literal
# "m", not the inline flag "(?m)" used by the neighbouring patterns; with "^"
# right after it the pattern looks unable to match -- confirm "(?m)" was meant.
# NOTE(review): "(?P(" and "\g " look like they lost a group name in this view.
self.text = re.sub(r"(m)^(?P('''[^{|}[\]\n']+?''',?( or|) *)+) may be an abbreviation for[:;]? *$", r"\g may stand for:", self.text)
heading(2, "Diff")
wikipedia.showDiff(self.page.get(), self.text)
heading(2, "Timeline", className="debug")
print '%s ' % (wikipedia.escape(wikipedia.timereport()),)
heading(2, "Edit box", className="debug")
# Look up the latest revision recorded in the database for this page.
cursor.execute("/* dabfix LIMIT:30 */SELECT page_latest FROM page WHERE page_namespace=%s AND page_title=%s", (self.page.namespace(), self.page.titleWithoutNamespace(underscore=True),))
# Show the OutOfSync message when the fetched revision id is set and differs
# from the database row.
if (self.page.revisionid, ) != cursor.fetchone() and self.page.revisionid:
from dab_solver import OutOfSync
print '%s
'%wikipedia.translate(self.site, OutOfSync)
print '''
Remove auto-descriptions
Remove auto sections
'''
# Hand the cleaned text to page.put() together with the edit summary.
self.page.put(self.text, comment="Cleanup per [[WP:MOSDAB]] using [[tools:~dispenser/cgi-bin/dabfix.py|Dabfix]]")
self.mosdab_suggestions(self.page.revisionid)
print ''
print ''
print '
'
# Best-effort table of the ten project banners with the most pages linking to
# this title or to any entry of self.redirects; one %s placeholder is generated
# per title so the values are passed as query parameters.
try:
cursor.execute('''/* dabfix */
SELECT
pb_title AS "Project",
COUNT(*) AS "Links to here"
FROM pagelinks
JOIN u_dispenser_p.projectbanner ON pb_page = pl_from
WHERE pl_namespace=0 AND pl_title IN (%s)
GROUP BY pb_title
ORDER BY 2 DESC
LIMIT 10
''' % ','.join(('%s',)*(1+len(self.redirects))), tuple([self.page.titleWithoutNamespace(underscore=True)]+self.redirects))
print 'Pages that link here belong to the following wikiprojects
'
print ''
print ''
# Column names come from the cursor description.
for tup in cursor.description:
print '%s '%tup[0]
print ' '
for tup in cursor:
print ''
print '\n'.join('%s '%str(item) for item in tup)
print ' '
print '
'
# NOTE(review): this bare except hides every failure in the block above,
# including programming errors -- consider narrowing it.
except:
pass
# Floating ToC
print ''
print '
Table of contents '
print '
'
# One entry per heading collected in the module-level `headings` list.
# NOTE(review): the format string below shows one placeholder for two values in
# this view; verify the literal against the original file.
for section in headings:
print '%s ' % ( wikipedia.sectionencode(section), section)
print ' '
def main():
    """Print the leading output line, then build a Robot and run it."""
    print('''''')
    Robot().run()
# Script entry point: runs only when the file is executed directly and
# wikipedia.handleUrlAndHeader() returns a true value.
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
try:
# Open the content section (with form=True) before any page output from main().
wikipedia.startContent(form=True,
head=r"""""")
main()
finally:
# The content section is closed even when main() raises.
wikipedia.endContent()
# NOTE(review): indentation is not visible in this view, so it cannot be told
# whether stopme() belongs to the `finally` suite or runs after it -- confirm.
wikipedia.stopme()