#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Please read [[WP:Disambiguation dos and don'ts]] before using this tool.
"""
text_test ="""
TODO
* Remove duplicate links (including if they redirect to the same place)
* detect self links (i.e. warn about unintended circular links)
* FIXME spaces removed when removing [[priamary]], [[a]] [[b]]
* Add/remove prefixes/suffix, e.g. untether -> tether -> tethering or "Lifetime" -> "A lifetime"/"The lifetime"

-------
Test cases:
* [[IOS]]
* [[Riku]], [[Lulu]], [[Yuna]]
* [[Sandy]] (birth dates, suggest prefix index?)
* [[Pepe]] - better auto date formatting
* [[Dreamweaver (disambiguation)]] - Primary links
* [[Ikeda]]
* [[Rashomon]] incorrect primary link
* Dates: [[Julia]]

Acid tests:
* [[( ) (disambiguation)]]
"""
import re
import sys
import time
import MySQLdb
import wikipedia
import cgitb; cgitb.enable(logdir='tracebacks')

# Set by Robot.__init__ (via ``global site``) before any link is rendered.
site = None #wikipedia.getSite()
# Titles of the headings emitted so far, in order (appended to by heading()).
headings = []
# Category-name pattern used to recognise geographical places.
CatPlaces = r'_places_|_communities_|_constituencies_|_Country,_|_counties$|^Barangays_of_|^Cities_|^Plantations_|^Suburbs_of_|^Towns_|^Townships_in_|^Villages_|^Wards_of_|^Woredas_of_|micropolitan_area$|parishes$|_geography_stubs$'
# Same value as the former ``ur"..."`` literal; written as a non-raw unicode
# literal (backslashes doubled) so the line also parses on interpreters that
# reject the ``ur`` prefix.  ``%s`` is filled in with a title pattern.
RelinkText = u"""(?mi)^([*#]+[ '\\"]*)(%s)([ '\\"]*(?:[,\\-–—]|''|\\"| is | were |$))"""
from related import dabcatlang


def bullets(links, sortkey=None):
    """Return one ``* [[title]]`` wikitext bullet per entry of *links*.

    Underscores in each title are shown as spaces.  *sortkey* is accepted
    but not used.
    """
    return '\n'.join("* [[%s]]" % s.replace('_', ' ') for s in links)


def likeescape(s, escape='\\'):
    """Escape the SQL ``LIKE`` wildcards ``_`` and ``%`` in *s*.

    The *escape* character itself is doubled first so that existing
    occurrences are not mistaken for escapes.  The parameter used to be
    ignored (a backslash was hard-coded); the default gives the same result
    as before.
    """
    return (s.replace(escape, escape * 2)
             .replace('_', escape + '_')
             .replace('%', escape + '%'))


def EnglishJoin(list):
    """Join strings as an English "or" list: ``a``, ``a or b``, ``a, b, or c``.

    The parameter name shadows the builtin ``list``; it is kept because it
    is part of the existing signature.
    """
    if len(list) <= 1:
        return ''.join(list)
    elif len(list) == 2:
        return ' or '.join(list)
    else:
        return ', '.join(list[:-1]) + ', or ' + list[-1]


def printu(s):
    """Print *s*, encoding it to UTF-8 first when it is a unicode object."""
    print (s.encode('utf-8') if isinstance(s, unicode) else s)


def htmlout(string, data=()):
    """Print ``string % data`` with every string value run through
    ``wikipedia.escape`` first.

    Non-string values in *data* are interpolated unchanged.  The result is
    encoded to UTF-8 unless it is already a byte string.  The default for
    *data* is an immutable empty tuple instead of a shared mutable list.
    """
    s = string % tuple(
        wikipedia.escape("%s" % value) if isinstance(value, (str, unicode)) else value
        for value in data)
    if isinstance(s, bytes):
        print(s)
    else:
        print(s.encode('utf-8'))


# NOTE(review): in debug/info/warn/error the literals around the message
# contain only line breaks in this copy of the file; presumably they held
# wrapper markup that was lost -- confirm against the upstream source.
def debug(s):
    """Emit *s* as an escaped debugging message through htmlout()."""
    htmlout('\n%s\n', (s,))


def info(s):
    """Emit an informational message through ``wikipedia.output``."""
    printu('\n')
    wikipedia.output("%s" % s)
    printu('\n')


def warn(s):
    """Emit a message prefixed with a coloured ``WARNING`` tag."""
    printu('\n')
    wikipedia.output("\03{lightorange}WARNING\03{default}: %s" % s)
    printu('\n')


def error(s):
    """Emit a message prefixed with ``ERROR:``."""
    printu('\n')
    wikipedia.output(u"ERROR: %s" % s)
    printu('\n')


def CreateLink(link, title=None, className="", addAttribute=''):
    """Return markup linking to the wiki page *link* on the global ``site``.

    link         -- page title; spaces become underscores in the address
    title        -- label to show; defaults to *link* with underscores
                    replaced by spaces
    className    -- optional value for a ``class`` attribute
    addAttribute -- optional raw attribute text appended after the class
    """
    if not title:
        title = link.replace('_', ' ')
    attributes = ' class="' + className + '"' if className else ''
    if addAttribute:
        attributes += ' ' + addAttribute
    raw_link = link.encode('utf-8') if isinstance(link, unicode) else link
    address = site.nice_get_address(
        wikipedia.urllib.quote(raw_link.replace(' ', '_'), safe=";@$!*(),/:-_."))
    # NOTE(review): the format string had been reduced to '%s', which raises
    # TypeError with these five arguments.  The anchor markup below is a
    # reconstruction that consumes all of them -- confirm the exact markup
    # (in particular the URL scheme) against the upstream source.
    return '<a href="//%s%s" title="%s"%s>%s</a>' % (
        site.hostname(),
        address,
        wikipedia.escape(link.replace('_', ' ')),
        attributes,
        wikipedia.escape(title),
    )


# Wall-clock time at which the current section started; reset by heading().
starttime = time.time()


def heading(level, title, style="", className=""):
    """Emit a section heading and record *title* in ``headings``.

    Also logs the previous heading via ``wikipedia.logtime``, prints the
    time elapsed since the previous heading, restarts that timer and
    flushes stdout.
    """
    global starttime
    wikipedia.logtime(headings[-1] if headings else '?')
    # NOTE(review): both format strings below had lost their markup ("\n"
    # with one argument, '%s' with seven), so each call raised TypeError.
    # They are rebuilt to consume every argument; the exact original markup
    # is unknown -- confirm against the upstream source.
    print("<!-- %.3f s -->\n" % (time.time() - starttime,))
    starttime = time.time()
    htmlout('<h%d id="%s"%s%s>%s<a href="#%s"></a></h%d>' % (
        level,
        wikipedia.sectionencode(title),
        ' style="%s"' % style if style else '',
        ' class="%s"' % className if className else '',
        title,
        wikipedia.sectionencode(title),
        level,
    ))
    sys.stdout.flush()
    headings.append(title)


def canonicalTitle(title, firstupper=True, underscore=False):
    """
    Converts unicode or bytes string to mw titles
    support: percent-encoded UTF-8, HTML character references
    """
    # TODO namespace support, e.g. 
[[WP: Foo]] if isinstance(title, unicode): title = title.encode('utf-8') # Unpercent-encode title = wikipedia.urllib.unquote(title) try: title = unicode(title, 'utf-8') except:title = unicode(title, 'latin-1') # HTML character references title = wikipedia.html2unicode(title) # Remove ltr and rtl markers title = title.replace(u'\u200e', '').replace(u'\u200f', '') # Strip the section part if '#' in title: title = title[:title.index('#')] # Underscore to space and Strip space title = title.replace('_', ' ').strip().lstrip(':') # Merge multiple spaces while ' ' in title: title = title.replace(' ', ' ') # First uppercase if firstupper and title: title = title[0:1].upper() + title[1:] if underscore: title = title.replace(' ', '_') return title def wikilinkregex(t, firstupper=True): t = canonicalTitle(t, firstupper) # Workaround for titles with an escape char if firstupper: t = ur'[%s%s]%s' % (t[0].upper(), t[0].lower(), t[1:],) t = re.escape(t).replace('\\[', '[', 1).replace('\\]', ']', 1) return t.replace('\\ ', '[ _]+').replace('\\|', '|') connections = {} def getConn(dbname, host=None): if not dbname.endswith('_p'): dbname+='_p' if (host,dbname) in connections: try:connections[host,dbname].ping() except:del connections[host,dbname] if (host,dbname) not in connections: connections[host,dbname] = MySQLdb.connect( db=dbname, host=host or dbname.replace('_', '-')+'.rrdb.toolserver.org', #host=host or dbname.replace('_', '-')+'.rrdb.toolserver.org', read_default_file='/home/dispenser/.my.cnf', # WMF's databases varbinary so it'll always be return in UTF-8 byte string # charset Option for wiktionary charset=None if 'wiktionary' in dbname else 'utf8', use_unicode=False ) return connections[host,dbname] #def dropConn(dbname=None): # for key in (dbname,) if dbname else connections: # if key in connections: # del connections[key] html_cache = {} def getParsedText(title): title = canonicalTitle(title, underscore=True) if title not in html_cache: urlname = 
wikipedia.urllib.quote(title.encode('utf-8'), safe=";@$!*(),/:-_.") html = site.getUrl(site.nice_get_address(urlname)).decode('utf-8') # XXX Vector skin specific html_cache[title] = html[html.index('
'):html.index(""):html.index("")] return html_cache[title] extract_summary_R = re.compile(r'

(?:.*["\',]* (?:is|was|were|are)(?= )|.*?.*?)(\'\'|"|,| \((?:\(.*?\)|[^(\n])*?\)|[\w \t]*\'*.*?\'*| is| was| were| are| or)* *(?P.+?)[,.:;]?

', re.M) def getsummary(title): s = getParsedText(title) info("Download [[%s]] for description" % title) # Avoid section redirects, rd_fragment isn't complete yet if 'redirectToFragment' in s: # redirectToFragment("#Corkscrew_Senton"); info("[[%s]] is a section redirect"%title) return '' if ' id="disambigbox"' in s or ' id="setindexbox"' in s: info("[[%s]] disambiguation page"%title) return '' s = s.replace('
\n', ' ') s = re.sub(r']+class="reference">.*?||||', '', s, flags=re.DOTALL) s = re.sub(r'(?u)[^\S\n]+', ' ', s) # convert emsp s = s.replace('', "''").replace('', "''") m = extract_summary_R.search(s) # FIXME use HTML instead if m: #wikipedia.output("\03{lightsilver}%s\03{default}%s\03{lightsilver}%s\03{default}"%(m.group()[:m.start(2)-m.start()], m.group(2), m.group()[m.end(2)-m.start():],)) s = m.group(2) if len(s) > 250: s = re.sub(ur'(.*?)\.((?:"|\'\'|) +[A-Z].*|$)', r'\1', s) info("Triming string from %d to %d bytes (%+d)"%(len(m.group()), len(s), len(s)-len(m.group()),)) wikipedia.output(wikipedia.unescape("\03{lightsilver}%s\03{default}"%m.group().replace(s, '\03{default}%s\03{lightsilver}'%s))) s = ', %s'%wikipedia.unescape(s).strip() if s.strip(',. '): # FIXME ", American actor" matched ", American actor and musician" print ''%(wikipedia.jsescape(title), wikipedia.jsescape(s),) return s else: error("Unable to get extract from [[%s]]'s HTML" % title) print '
%s
'%(wikipedia.escape(re.sub(r'\s*\n\s*\n\s*','\n\n', s)).encode('utf-8'),) return '' # TODO move to commonfixes replacementset = { # Unicode to ASCII u'−': '-', # minus sign u'–': '-', # en dash u'—': '-', # em dash u'…': '...', # ellipsis u'×': 'x', # times u'“': '"', u'”': '"', # ASCII approximations and substitutions ' -': '-', '- ': '-', '--': '-', '_': ' ', '`': "'", '/': '-', '*': 'x', # language approximations 'ae': 'a', 'ey': 'ei', 'oh': 'o', 'ou': 'o', 'uu': 'u', " 'n": ' an', ' and ': ' & ', ' the ': ' ', ' of ': ' in ', 'k': 'c', } import unicodedata def strip_accents(s): return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')) def strip_variations(s): # returns i = None if ' (' in s: i = s.find(' (') #if ', ' in s: i = s.rfind(', ') s = " %s " % s[:i].lower() s = strip_accents(s) for c1, c2 in replacementset.iteritems(): s=s.replace(c1, c2) for c in ' !"\',-.:;?': s=s.replace(c, '') return s.upper() def skipredirect(m): # TODO consider [[Aude (river)]] the same as [[Aude River]] # TODO add variable for acceptable changes to qualifier [0,1], where 1 allow complete change title = m.group() # `cursor` been globally defined cursor.execute("""/* dabfix.skipredirect LIMIT:30 */ SELECT rd_title, rd_fragment FROM page JOIN redirect ON page_id=rd_from WHERE page_namespace=%s AND page_title=%s AND rd_namespace=page_namespace -- Field is currently not populated -- AND rd_interwiki = "" """, (0, canonicalTitle(title, underscore=True).encode('utf-8'),)) results = cursor.fetchall() if results: target = results[0][0].replace('_', ' ').decode('utf-8') if results[0][1]: # avoid bypassing section redirects return m.group() if any(c in title for c in '(,') and not any(c in target for c in '(,'): # don't lose the qualifier pass info("Found redirect: [[%s]] to [[%s]]" % (title, target, )) elif strip_variations(target)==strip_variations(title): info("Bypassing redirect [[%s]] to [[%s]]" % (title, target, )) if title[0].islower(): return 
target[0].lower() + target[1:] else: return target else: debug('Comparing %r to %r' % (strip_variations(title), strip_variations(target),)) info("Redirect: [[%s]] to [[%s]]" % (title, target, )) return m.group() def parseline(line): def getYear(s): # 47 BC # 2nd-century BC m = re.match(r'(\d+)(s(?= )|)((?:st|nd|rd|th)-century|)( BC|)', s.replace('_', ' ')) if m: return "%s%s%s%s"%( 'c. ' if m.group(2) else '', m.group(1), m.group(3), " BC" if m.group(4)==" BC" else '' ) return None def yearRange(birth, death, born="born", died="died"): """ Formats birth and death years so "AD" is hidden for the modern era Accepts: 17 century/1860s/c. 1867/1867 [BC|AD] """ def n(tup): return " BC" if tup[1] else "" if tup[0]=="?" or len(tup[0])>=3 else " AD" death = death.partition(' BC') if death else None birth = birth.partition(' BC') if birth else None if birth and not death: return u"%s %s%s" % (born, birth[0], n(birth),) elif birth and death: if birth[1]==death[1]: return u"%s–%s%s" % (birth[0], death[0], n(death),) else: return u"%s BC–%s AD"%(birth[0], death[0],) elif not birth and death: return u"%s %s%s" % (died, death[0], n(death),) else: return u"" def cmpr(a, b): # A a subset of B #printu("Comparing %s to %s
"%(a,b)) if ''.join(re.split(r'[^A-Z0-9]+', b)) == a.upper(): # Initialisms return True else: return strip_variations(a) in strip_variations(b) text = line.group() if '[[' not in text: print '\n
\n' return text else: # Fix formatting of primary link text = re.sub(ur"(?m)^\* *(''|\")\[\[([^{|}[\]\n]+)( \([^{|}[\]\n]+\))\]\]([, ]*)\1", ur"* [[\2\3|\1\2\1\3]]\4", text) # Simple [[MOS:DAB]] corrections text = re.sub(ur"^([^.]*?)(?Please create %s with:

' % CreateLink(t+' (disambiguation)', className="new")) htmlout('
#REDIRECT [[%s]]
', (t.replace('_', ' '),)) return m.group() else: if m.group(2): return m.expand(r'[[\1 (disambiguation)\2]]') else: return m.expand(r'[[\1 (disambiguation)|\1]]') text = re.sub(r'\[\[([^{|}[\]\n]+)(\|.*?|)\]\]( *\(disambiguation\)|)', toDisambiguation, text) # We now build the dictionary `links` telling us what is safe to unlink # This is done marking links which are or have a redirect that is a subset # of the `prefixes` list # links = {} redlink = None primarylink = None titles_R = re.compile(r'(?<=\[\[)[^{|}[\]\n]+?(?=\s*(?:\|.*?|)\]\])') for title in titles_R.findall(text): if not primarylink: primarylink = title cursor.execute(""" /* dabfix.parseline LIMIT:30 */ SELECT page.page_namespace, page.page_title, rd.page_namespace, rd.page_title, rd_fragment, GROUP_CONCAT(IF(pp_value IS NULL, cl_to, NULL) SEPARATOR '|'), (SELECT pp_value FROM page_props WHERE pp_page=IFNULL(rd.page_id, page.page_id) AND pp_propname="displaytitle") FROM page LEFT JOIN redirect ON rd_from = page.page_id LEFT JOIN page AS rd ON rd.page_namespace = rd_namespace AND rd.page_title = rd_title JOIN categorylinks ON cl_from = IFNULL(rd.page_id, page.page_id) LEFT JOIN page AS catpage ON catpage.page_namespace = 14 AND catpage.page_title = cl_to LEFT JOIN page_props ON pp_page = catpage.page_id AND pp_propname = "hiddencat" WHERE page.page_namespace=%s AND page.page_title = %s GROUP BY page.page_title LIMIT 1 """, (0, canonicalTitle(title, underscore=True),)) result = cursor.fetchone() or (None,)*7 # MySQL's default max packet length is 1 KB, truncating the rest # Truncation may occur during a UTF-8 sequence, so we unsafely ignore it result = tuple(s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in result) # Notes: # displaytitle isn't updated after a move, see [[Victory (1996 film)]] d = dict( ns = result[0], title = result[1], rd_ns = result[2], rd_title = result[3], rd_fragment = result[4], displaytitle= result[6], categories = (result[5] or '').split('|'), # 
derived dabpage = "All_disambiguation_pages" in (result[5] or ''), redirects = [], overlapping = any(cmpr(prefix, title) for prefix in prefixes), # principle should be better defined, it use to be both primary and principle principle = text.find(title) < 15 or text.find(title) < text.find(', ') < len(text)*2//3, ) # get all redirect titles cursor.execute("""/* dabfix LIMIT:30 */ SELECT page_title FROM page JOIN redirect on (page_id=rd_from) WHERE page_namespace=0 AND rd_namespace=0 AND rd_title=%s UNION SELECT rd_title FROM page JOIN redirect on (page_id=rd_from) WHERE page_namespace=0 AND rd_namespace=0 AND page_title=%s """, (canonicalTitle(title, underscore=True),)*2 ) for tup in cursor: (s,) = (s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in tup) d['redirects'].append(s) d['overlapping'] |= any(cmpr(prefix, s) for prefix in prefixes) # debugging #print "title key: %r
" % (title,) #print 'prefix list: ', prefixes, '
' #print result, '
' #print '%r
' % (d,) links[title] = d if not (title[0:3].islower() and title.find(':', 1)>0): # [[:ja:北原亞以子]] if not redlink and not d['title']: if not title.startswith('Special:'): # [[Special:PrefixIndex/...]] redlink = title if d['displaytitle']: debug("Has DISPLAYTITLE:%(displaytitle)s" %d) # TODO format pattern for ships titleSpec = ( ("No format", r"^(\d+)_(architecture)$", ''), ("Italics", r"^(\d+)_(albums|books|films|live_albums|musicals|novels|operas|plays|soundtracks|television_films|video_games)$", "''"), ("Quote", r"^(\d+)_(songs|singles|short_stories|television_episodes)$", '"'), ) def formatLink(m): # Test cases # [[A (b)|"A" (b)]] DONE # "[[A]]" DONE # "[[A (b)|A]]" # [[A (b)|A (b)]] # [[A (b)|"A" (''b'')]] DONE mark = m.group(4) or m.group(1) title = m.group('title') # key in links dictionary target = canonicalTitle(title, underscore=True) label = "%s%s%s"%(m.group(1), m.group('label') or m.group('title'), m.group(1),) new_label = label if not target: #info('section link') # [[#section (pinball)]] return m.group() elif title not in links: # Program screwed up somewhere error("%s\n\nDoes not contain %r" % (repr(links).replace('{', '{\n').replace(',', ',\n'), title)) # throw error links[title]['rd_title'] elif links[title]['rd_title'] or '#' in title or links[title]['title'] is None: # Skip these as {{DISPLAYTITLE:}} or Category matching # are incorrect. This is probably not necessary # with good title comparer. # TODO [[Category:Redirected_episode_articles]]?, links->categories contains target page if '#' in title: warn("[[%s]] links to a section" % (target,)) elif links[title]['rd_title']: rd_title = links[title]['rd_title'] rd_fragment = links[title]['rd_fragment'] warn("[[%s]] redirects to [[%s]]" % (target, rd_title+('#'+rd_fragment if rd_fragment else ''),)) else: # TODO add separate flag for red links, see above with Special:... 
warn("[[%s]] is a red link" % (target, )) elif links[title]['displaytitle']: # Use {{DISPLAYTITLE:}} whenever available new_label = wikipedia.html2unicode(links[title]['displaytitle'].replace('', "''").replace('', "''")) #info("{{DISPLAYTITLE:%s}}" % (new_label,)) else: # Otherwise fall back to category match for rulename, pattern, c in titleSpec: for cat in links[title]['categories']: if re.search(pattern, cat): if mark and mark != c: warn("Formatting conflict (%s => %s) with [[Category:%s]]" % (mark, c, cat,)) else: mark = c info("%s rule %s matches [[Category:%s]]" % (rulename, pattern, cat,)) (subject, qualifier) = re.search(r'^(.+?)([ _]*\([^()]+\)|)$', target).groups() new_label = ''.join((mark, subject, mark, qualifier)).replace('_', ' ') #if label != new_label: debug("Automatic label: %s"%new_label) def test(s): return canonicalTitle(re.sub(r"''|'''|\"|]*>", '', s)) # XXX how is [[w (x), y (z)]] handled? (o_subject, o_qualifier) = re.search(r'^(.+?)([ _]*\([^(\n)]+\)|)$', label).groups() (n_subject, n_qualifier) = re.search(r'^(.+?)([ _]*\([^(\n)]+\)|)$', new_label).groups() (t_subject, t_qualifier) = re.search(r'^(.+?)([ _]*\([^(\n)]+\)|)$', target).groups() #info("\ntarget: <%s>\nnew_label: <%s>"%(target,new_label,)) #info("\nt_subject: <%s> \nt_qualifier: <%s> \no_subject: <%s> \no_qualifier: <%s> \nn_subject: <%s> \nn_qualifier: <%s> \n"%(t_subject, t_qualifier, o_subject, o_qualifier, n_subject, n_qualifier, )) if test(o_subject) == test(n_subject): # Copy qualifier styling # XXX Hack to copy extra formatting from displaytitle while keeping original formatting if test(o_qualifier) == test(n_qualifier) and len(n_qualifier) <= len(o_qualifier): n_qualifier = o_qualifier # [[Flash (Chuck)|Flash (''Chuck'']] if links[title]['principle'] and '#' not in title and test(n_subject) == test(t_subject): debug("[[%s]] is the principle link" % title) # If the target qualifier is not the same # e.g. 
(''Buffy'' episode) => (Buffy: The Vampire Slayer episode) if test(n_qualifier) != test(t_qualifier): n_qualifier = t_qualifier.replace('_', ' ') else: n_qualifier = '' new_label = ''.join((n_subject, n_qualifier)) repl = "[[%s|%s]]"%(title, new_label) # TODO [[lower|"Lower"]] => "[[Lower]]" # TODO avoid capitalizing non-principle links; e.g. [[Pest (organism)|pest]] repl = re.sub(r"\[\[(.*?)\|(''|\")([^{|}[\]\n]+)\2\]\]", r'\2[[\1|\3]]\2', repl, flags=re.I) repl = re.sub(r"\[\[( *(.+?) *)\| *\2 *\]\]", r'[[\1]]', repl, flags=re.I) #TODO rename keys when changing titles return repl else: return m.group() # \g needs to be the same as the dictionary building one text = re.sub(r"(''|\"|)\[\[(?P<title>[^{|}[\]\n]+?)[ |]*(?P<label>(?<=\|)(''|\"|).*?|)\]\],??\1", formatLink, text) if redlink: disambiguationcategory = dabcatlang.get(site.dbName()+'_p', dabcatlang['enwiki_p']) # TODO eliminate the current worked on page from the list cursor.execute(""" /* Trace pages back LIMIT:30 */ SELECT page_namespace, page_title, page_is_redirect, (SELECT IF(cl_to!='All_set_index_articles','(disambig)','(set-index)') FROM categorylinks WHERE cl_from=page_id AND cl_to IN ("""+','.join(("%s",)*len(disambiguationcategory))+""")) AS dab FROM page JOIN pagelinks ON pl_from=page_id WHERE pl_namespace=%s AND pl_title=%s """, disambiguationcategory+(0, canonicalTitle(redlink, underscore=True), )) results = cursor.fetchall() rows = cursor.rowcount # TODO create summary from page with highest concentration of red links if len(results): printu("<p>The following pages link to <b>%s</b></p>" % (CreateLink(redlink, className="new"),) ) print('<ul class="columns">') for tup in results: (ns, title, redirect, dab) = (s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s for s in tup) printu(u"<li>%s %s</li>" % (CreateLink(wikipedia.namespaces[ns]+':'+title if ns else title), u"(redirect page)" if redirect else dab or u"", )) print("</ul>") else: warn("No pages link to [[%s]]" % (redlink,)) 
if any(ns for (ns, title, redirect, dab) in results if ns<0): wikipedia.output("Special page") elif sum(1 for (ns, title, redirect, dab) in results if ns==0 and dab is None)==0: # Display information about deletion # # TODO, improved by checking for an AfD subpage # WP:CSD#G11 - Blatant advert; AfD - consensus to delete remove_log_reason_R = re.compile(r'(^|\{\{|\[\[|/wiki/)(Project:|Wikipedia:|WP:|^)(AFD|HOAX|PROD|BLPPROD|Articles[_ ]+for[_ ]+deletion/[^{|}[\]]*|(CSD#|SD#|CSD[ _]+|CSD\]\] |^)(A7|G5|G11))\b', flags=re.I) cursor.execute('''/* dabfix.parseline LIMIT:30 */ SELECT log_comment FROM logging_ts_alternative WHERE log_namespace=%s AND log_title=%s AND log_type="delete" ''', (0, canonicalTitle(redlink, underscore=True),)) m = None for i, (log_comment,) in enumerate(cursor): log_comment = log_comment.decode('utf-8') info("Deletion logs: %(log_comment)s" % locals()) if not m: # and i==0: m = remove_log_reason_R.search(log_comment) if m: text = "" info('Removing [[%s]], matches %r' % (redlink, m.groups())) else: text = "<!--%s -->"%text.rstrip() info('Commenting out red link [[%s]] per [[MOS:DABRL]]: No article links to it' % redlink) #elif 1 < len(results) < 4: # Unlink # text = re.sub(r'\[\[(%s)\|?((?<=\|).*?)\]\]'%wikilinkregex(redlink), lambda m: m.group(2) or m.group(1), text) else: info("%s pages link here"% (rows,)) else: # Per MOS:DAB, we unlink non-relevant links # TODO This should be possibly move above the link removal code overlapping_links = sum(link['overlapping'] for link in links.itervalues()) if overlapping_links >= 1: def f(match): t = match.group(1) if t in ('floruit', 'Floruit', 'fl.',): del links[t] # We pretend it does not exist return match.group() elif t not in links: # [[again]] then [[again]] (already deleted) return match.group(2) or match.group(1) elif links[t]['overlapping']: return match.group() else: # Unlink text del links[t] return match.group(2) or match.group(1) text = re.sub(r'\[\[(?P<title>[^{|}[\]\n]+?)[ 
|]*(?P<label>(?<=\|).*?|)\]\]', f, text) # we may have removed the first link if primarylink not in links: primarylink = links.keys()[0] else: info("No overlapping link on "+EnglishJoin(["[[%s]]"%key for (key, link) in links.iteritems() if link['overlapping']])) if len(links)==1 and not any(links.values()[0][cat] for cat in ('dabpage', 'rd_title')): # FIXME allow [[title]]'s IATA code m = re.compile(r'''^ ((?:[^'"[\],\n()] |,\ [^[\]|(),\s]*(?=,) |\[\[[^[\]\n]+\]\] |"[^'[\],\n{}]+" |''[^'"[\],\n{}]+'' |" |'' |'(?!') )+) ((?:\ -|[-,:]\ |\s)*) (?:\(+([^(\n)]+)\)+)? ([-.,: ]*) (.*) ''', re.M | re.X).search(text) subject, meta, description = '', '', '' subject = text if m: subject, spacer1, meta, spacer2, description = m.groups() #printu('<div>%s</div>' % ''.join('<samp>%s</samp>'%wikipedia.escape(s) for s in m.groups())) htmlout('<div>%s</div>' % ''.join('<samp title="%s">%%s</samp>'%tip for tip in ("link", "spacer1", "metadata", "spacer2", "description",)), m.groups()) description = "%s%s"%(spacer1 if spacer1.strip() else spacer2 if spacer2.strip() else ", " if description else '', description) else: error('> Unable to separate parts: %s'%text) # Handy category list printu(u'<div class="catlinks">Categories: %s</div>'%' | '.join(CreateLink("Category:"+cat, cat.replace('_', ' ')) for cat in links[primarylink]['categories'])) # FIXME if len(subject)-subject.find(']]') < 5 and 0 < subject.find('[[') < 8 and description == "": if any(re.search(CatPlaces,cat) for cat in links[primarylink]['categories']): wikipedia.output("Geographical places like [[%s]] don't need descriptions" % primarylink) # TODO add section redirect here # TODO add disambiguation check here else: description = getsummary(primarylink) # FIXME # Comics characters introduced in 1977 # 2006 comic debuts # https://en.wikipedia.org/w/index.php?title=Firefly_(disambiguation)&diff=424199591&oldid=424193235 debut = [] birth = None death = None date = [] ### print "%r"%(links,) # category regex for ", a X blah" 
# FIXME Handle the case when all we have are missing dates debut_R = re.compile(r'^(?P<year>\d{1,4})_(albums|architecture|books|films|live_albums|musicals|novels|operas|plays|poems|short_stories|EPs|songs|singles|soundtracks|television_episodes|television_films|video_games|works|manga|anime|sculptures|paintings)$') descript_R = re.compile(r'^([ ,-]*)(a |an |(?= album | building | book | comic book | film | musical | novel | opera | play | short story | single | song | soundtrack | story | video game | manga | anime | sculpture | painting ))') for cat in links[primarylink]['categories']: # FIXME Less hardcoding if cat.endswith("_births"): birth = getYear(cat) or birth elif cat.endswith("_deaths"): death = getYear(cat) or death elif cat=="Living_people": death = "" elif cat=="Missing_people": death = "" or death elif cat=="Possibly_living_people": death = "" elif cat=="Year_of_death_missing": death = "?" elif cat=="Year_of_birth_missing": birth = "?" elif cat=="Year_of_death_unknown": death = "?" elif cat=="Year_of_birth_unknown": birth = "?" elif debut_R.search(cat): debut.append(debut_R.search(cat).group("year")) #info("> Got year %s from [[Category:%s]]"%(debut, cat,)) else: vague_date = re.search(r'(_|^)(1[6-9]\d\d|20[0-4]\d)(_|$)', cat) if vague_date: date.append(vague_date.group(2)) info('Unused date category: %s' % (cat,)) # Subject (Metadata), Description #if (text.find(primarylink) < 15 or (text.find(primarylink) < text.find(', ') < len(text)*2//3)) and '#' not in primarylink: if links[primarylink]['principle'] and '#' not in primarylink: if birth!=None or death!=None: # People if birth and birth.isdigit() and 1600 < int(birth)<time.gmtime().tm_year - 125: # Mark people older than 125 as death data unknown (1600 is arbitrary) death = death or '?' biotext = yearRange(birth, death, "born", "died") info('Date of Birth/Death: %s'%biotext) if biotext and biotext not in primarylink: if not meta: meta = biotext else: meta = re.sub(ur''' ( ((born|b|died|d)[. 
]*|) ((c|ca|circa|about|aft|after|before)[. ]*|) (([0-3][0-9][ ]|)(Jan|Feb|Mar|May|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[abceghilmnoprstuvy.]*([ ][0-3][0-9]|)[, ]*|) [\d?]+ ((?:st|nd|rd|th)-century|) ([ ]*(CE|BCE|BC|AD)|) ([ ]*([-–—]|&\w+;|\ to\ )[ ]*|) ){1,2}''', biotext, meta, 1, flags=re.I | re.X) if biotext not in meta: meta = "%s, %s"%(meta, biotext,) description = description.replace(biotext, '').replace(' ()', '') # remove lifespans # Do NOT include these per [[MOS:DAB#People]] description = re.sub(r'(?iu)^([ ,-]*)(a |an |the |)', r'\1', description) elif debut: if not re.search(r'\b\d{3,4}\b', subject+description): # Use the most frequent year description = descript_R.sub(r'\1a %s '%max(set(debut), key=debut.count), description, 1) if not any(year in description for year in debut): info("> Unused year: %s"%EnglishJoin(debut)) else: pass text = subject.strip() if meta: text += " (%s)"%meta.strip() if description: text += description.lstrip() text = text.strip(', ') if text != line.group().strip(): wikipedia.output(u"\03{lightred}%s\03{default}"%line.group()) while ' ' in text: text = text.replace(' ', ' ').rstrip() wikipedia.output(u"\03{lightgreen}%s\03{default}"%text) print '\n<hr/>\n' return text else: wikipedia.output(u"\03{lightgreen}%s\03{default}"%text) print '\n<hr/>\n' return line.group() class Robot(object): def __init__(self): self.page = wikipedia.MyPage self.site = self.page.site() try: self.cursor = getConn(self.site.dbName()).cursor() except: self.cursor = getConn(self.site.dbName(), host="sql-s1-user").cursor() global site; site = self.site global cursor; cursor = self.cursor self.redirects = [] self.prefixes = set() self.text = '' self.page_id = 0 self.preview = False self.usecommonfixes = wikipedia.SysArgs.get("commonsfixes")!='no'# '#' in self.text self.enable_wiktionary = wikipedia.SysArgs.get("wiktionary")!='no' self.disambiguationcategory = dabcatlang.get(self.site.dbName()+'_p', dabcatlang['enwiki_p']) # Not implemented self.summaryflags = 
{} def setsummary(self, flag, performed_on): if flag not in self.summaryflags: self.summaryflags[flag] = [] self.summaryflags[flag].append(performed_on) def __repr__(self): return 'dabfix.py '+' '.join(tuple("-%s:%s" % t for t in wikipedia.SysArgs.items())) def addsection(self, sectionname, new_text): repl = "\n== %s ==\n%s\n" if re.search(r'(?m)^==[^=]+==$', self.text) else "\n=== %s ===\n%s\n" if new_text: debug('Adding %r section (%d lines)' % (sectionname, new_text.count('\n')+1,)) # Place before the last template # Or the last empty section (avoid == References ==\n {{reflist}}) self.text, count = re.subn( r'(?s)(?=(\n(=+[^\n]+=+\s*|)\{\{[^{}]+\}\}\s*)+[^{]*?$)', repl%(sectionname, new_text), self.text, 1) if count==0: self.text += repl%(sectionname, new_text) # Add to JS list for removing print '<script type="text/javascript">AddedSection(%s, /%s/);</script>' % ( wikipedia.jsescape(sectionname), r"[\r\n]+(=+) %s \1[\r\n]+((?![{=}]).*[\r\n]*)*"%re.escape(sectionname), ) def addprefix(self, t): iEnd=t.find('_(') if iEnd==-1: iEnd=None self.prefixes.add(t[:iEnd]) def reconnect(self, reason=None): # MySQL drop the connect, we'll need to reconnect # FIXME this is a hack, it should be handled by a connection routine that also flushes the cursor global cursor; cursor = getConn(self.site.dbName()).cursor() def getprefixes(self): print '<div class="debug">' heading(2, "Redirects") self.addprefix(self.page.title(underscore=True)) cursor.execute("SELECT page_title FROM page JOIN redirect ON page_id=rd_from WHERE page_namespace=0 and rd_namespace=%s and rd_title=%s", (0, self.page.titleWithoutNamespace(underscore=True),)) rows = cursor.rowcount if rows: print '<ul class="columns">' if rows > 6 else '<ul>' for (redirect,) in cursor: self.redirects.append(redirect) print "<li>%s</li>"%CreateLink(redirect) self.addprefix(redirect.decode('utf-8')) print "</ul>" else: print '<p>There are no redirects</p>' info('Prefixes used for matching: %s' % 
(EnglishJoin(sorted(self.prefixes)),)) print '</div>' # Pre-materialize sub-query for use in Blue and Red link recovery cursor.execute("""/* dabfix.getprefixes LIMIT:30 */ /* List of links on the page */ SELECT IFNULL(rd_title, pl_title) FROM page JOIN pagelinks ON pl_from = page.page_id LEFT JOIN page AS rd ON rd.page_namespace = pl_namespace AND rd.page_title=pl_title LEFT JOIN redirect ON rd_from = rd.page_id AND rd_namespace = 0 WHERE page.page_namespace = %s AND page.page_title = %s AND pl_namespace=0 UNION SELECT %s """, (self.page.namespace(),)+(self.page.title(underscore=True),)*2) self.existingLinks = cursor.fetchall() self.median = 0 try: cursor.execute("""/* dabfix.getprefixes LIMIT:30 NM */ SELECT COUNT(*) AS FREQ FROM page AS dab JOIN pagelinks AS p ON p.pl_from = dab.page_id JOIN pagelinks AS s ON s.pl_namespace=p.pl_namespace AND s.pl_title=p.pl_title JOIN page AS blue ON blue.page_namespace=p.pl_namespace AND blue.page_title=p.pl_title WHERE dab.page_namespace=%s AND dab.page_title=%s AND p.pl_namespace=%s GROUP BY blue.page_namespace, blue.page_title -- GROUP BY p.pl_namespace, p.pl_title ORDER BY FREQ; """, (self.page.namespace(), self.page.title(underscore=True), 0,)) results = cursor.fetchall() if results: self.median, = results[len(results)//2] debug('The median linktivity is %d (sample %d links)'%(self.median, len(results),)) except Exception, e: self.reconnect() self.median = 20 warn('Unable to determine median linktivity (%s), assuming %d'%(e, self.median)) def getdefinitions(self): titles_to_look_for = [] # Build a list of title permutation from prefixes list for title in self.prefixes: title_lcfirst = title[0:1].lower()+title[1:] titles_to_look_for += [ # Include first uppercase and first lowercase variants title, title_lcfirst, # e.g. [[wikt:-san]] '-'+title, '-'+title_lcfirst, # e.g. 
[[wikt:emo-]] title+'-', title_lcfirst+'-', ] # Get existing wiktionary links (XXX:Feb2012:{{Sec link auto}} broken it) # FIXME page_id is always 0 try: self.cursor.execute( "/*LIMIT:30 NM*/SELECT iwl_title FROM iwlinks WHERE iwl_prefix IN ('wikt', 'wiktionary') AND iwl_from=%s", (self.page_id,) ) except MySQLdb.OperationalError as (errno, strerror): if errno not in (1317, 2006): raise except Exception as x: # mystery error raise BaseException(self.page_id, self.page.title(), repr(x)) else: for iwl_title in self.cursor: titles_to_look_for += iwl_title.decode('utf-8') try: query_start = time.time() conn = getConn("%swiktionary"%self.site.language()) wikt_curs = conn.cursor() # XXX Older servers masquerade utf-8 as latin-1 varchar wikt_curs.execute("DESCRIBE page page_title") if 'varchar(255)' in wikt_curs.fetchall()[0]: wikt_curs.execute("SET NAMES 'latin1'") wikt_curs.execute(""" /* dabfix.getdefinitions() LIMIT:90 */ -- SELECT DISTINCT derived.page_title FROM ( ( SELECT DISTINCT page.page_title, page.page_len FROM page WHERE page.page_namespace=0 AND page.page_title IN ("""+ ','.join(('%s',)*len(titles_to_look_for))+""") ) UNION DISTINCT ( SELECT DISTINCT page.page_title, page.page_len FROM page JOIN redirect ON rd_namespace=page.page_namespace AND rd_title=page.page_title JOIN page AS rd ON rd.page_id=rd_from WHERE page.page_namespace=0 AND rd.page_namespace=0 AND rd.page_title IN ("""+ ','.join(('%s',)*len(titles_to_look_for))+""") ) ORDER BY page_len DESC -- ) AS derived; """, tuple(s.encode('utf-8') for s in titles_to_look_for)*2) except MySQLdb.OperationalError as (errno, strerror): if errno == 1317: # 'Query execution was interrupted' error("Wiktionary database did not response in time (%d seconds)" % (time.time()-query_start,)) else: error("Wiktionary OperationalError (%d, %s)" % (errno, strerror)) self.addsection('Wiktionary', "* FAILED %s" % strerror) return # Abort #finally: # with open('./generation_stats/getdefinitions', 'ab') as f: # f.write('%d\n' % 
def addWiktionary(self):
    """Insert or refresh the ``{{wiktionary|...}}`` box in ``self.text``.

    Does nothing when ``self.enable_wiktionary`` is false.  Otherwise the
    entries returned by ``self.getdefinitions()`` are turned into a single
    ``{{wiktionary}}`` template (at most ``max_definitions`` entries) which
    replaces the existing template, or is prepended when the page has none.
    When the lookup yields an empty list the existing template is removed.

    Known limitation: doesn't work with multiple boxes.
    """
    max_definitions = 5
    if not self.enable_wiktionary:
        return

    heading(2, 'Wiktionary links', className="debug")
    definitions = self.getdefinitions()
    print('<div class="debug">')

    if definitions is None:
        # getdefinitions() returns None when the database lookup was aborted
        # (it has already reported the failure).  Treating that as "no
        # definitions" would strip a perfectly good {{wiktionary}} box from
        # the page, so leave the text alone.
        # NOTE(review): assumes None is only returned on failure - confirm
        # against the part of getdefinitions() that builds the title list.
        wikipedia.output("Wiktionary lookup failed; keeping existing links")
        print('</div>')
        return

    shown = sorted(definitions[:max_definitions])
    excluded = definitions[max_definitions:]
    if shown:
        wiktionarylinks = "{{wiktionary|%s}}" % '|'.join(shown).replace('_', ' ')
        htmlout("Definitions box: <code>%s</code>", (wiktionarylinks,))
        if excluded:
            wikipedia.output("Excluding %s since {{Wiktionary}} is limit to %s definitions" % (
                EnglishJoin(["[[wikt:%s]]" % page_title for page_title in excluded]),
                max_definitions,
            ))
    else:
        wiktionarylinks = ""
        wikipedia.output("No definitions from wiktionary")

    # Give infoboxes a "wikt" parameter holding a {{wiktionary}} placeholder;
    # the placeholder is then filled in by the substitution below.
    self.text = re.sub(
        r'(?i)(\{\{Infobox(?:[^{|}]|\{\{[^{}]+?\}\}|\|(?!\s*wikt))*?)(?:\s*\|\s*wikt\s*=|)(\s*)\}\}',
        r'\1\2| wikt = {{wiktionary}}\2}}',
        self.text)

    m = re.search(r'(?sx)\{\{([Ww]iktionary)(\s*\|.*?|)\}\}[ ]*', self.text)
    if m:
        # A callable replacement keeps backslashes in page titles literal;
        # a plain string would be parsed as a regex replacement template.
        self.text = m.re.sub(lambda _match: wiktionarylinks, self.text)
    elif wiktionarylinks:
        self.text = wiktionarylinks + '\n' + self.text
    print('</div>')
def primary_entry(self):
    """Detect primary topics: articles that link to this page and back.

    Runs one query for pages (or redirects to them) that both link to the
    disambiguation page and are linked from it.  A match whose title (or
    one of its redirects) equals a known prefix and which carries a hatnote
    template is recorded as a primary topic, its names are added as
    prefixes, and its bullet entry in ``self.text`` is turned into a bolded
    lead sentence.  Matches without a hatnote only produce a warning.
    """
    # TODO Determine which topics go where on multi-topic primary pages
    hatnotes = ("Hatnote",)
    hatnote_slots = ','.join(("%s",) * len(hatnotes))
    sql = (
        "/* dabfix.primary_entry LIMIT:30 */ "
        "SELECT GROUP_CONCAT(DISTINCT rdpagein.page_title SEPARATOR '|'), "
        "pagein.page_title, dab.page_title, pageout.page_title, "
        "rdpageout.page_title, "
        "EXISTS (SELECT 1 FROM templatelinks "
        "WHERE tl_from = pagein.page_id AND tl_namespace = 10 "
        "AND tl_title IN (" + hatnote_slots + ") ) AS hatnote "
        "FROM page AS dab "
        "JOIN pagelinks AS linkin ON linkin.pl_title = dab.page_title "
        "AND linkin.pl_namespace = 0 "
        "JOIN page AS pagein ON pagein.page_id = linkin.pl_from "
        "AND pagein.page_namespace = 0 "
        "LEFT JOIN redirect AS rdin ON rdin.rd_title = pagein.page_title "
        "AND rdin.rd_namespace = 0 "
        "LEFT JOIN page AS rdpagein ON rdpagein.page_id = rdin.rd_from "
        "AND rdpagein.page_namespace = 0 "
        "JOIN pagelinks AS linkout ON linkout.pl_from = dab.page_id "
        "AND linkout.pl_namespace = 0 "
        "JOIN page AS pageout ON pageout.page_title = linkout.pl_title "
        "AND pageout.page_namespace = 0 "
        "LEFT JOIN redirect AS rdout ON rdout.rd_from = pageout.page_id "
        "AND rdout.rd_namespace = 0 "
        "LEFT JOIN page AS rdpageout ON rdpageout.page_title = rdout.rd_title "
        "AND rdpageout.page_namespace = 0 "
        "WHERE dab.page_namespace = %s AND dab.page_title = %s "
        "AND ( pagein.page_id = pageout.page_id "
        "OR pagein.page_id = rdpageout.page_id "
        "OR rdpagein.page_id = pageout.page_id "
        "OR rdpagein.page_id = rdpageout.page_id ) "
        "GROUP BY pagein.page_title "
        "LIMIT 25 /* should be enough */; "
    )
    cursor.execute(sql, hatnotes + (self.page.namespace(), self.page.title(underscore=True),))
    rows = cursor.rowcount

    known_before = self.prefixes.copy()
    primlink = {}
    if rows:
        heading(2, "Primary topic", className="debug")
        debug('Analyzing circular links')

    for record in cursor:
        (inrd, inpage, dabpage, outpage, outrd, hatnote) = tuple(
            value.decode('utf-8', errors="ignore") if isinstance(value, bytes) else value
            for value in record)
        # The link chain is only emitted inside an HTML comment.
        print('<!--')
        debug(u' → '.join('[[%s]]' % name.replace('_', ' ')
                          for name in (inrd, inpage, dabpage, outpage, outrd,) if name))
        print('-->')

        # XXX What are we doing here?
        # The linking page counts when its title, or any redirect pointing
        # at it, is exactly one of the known prefixes.
        candidates = [inpage]
        if inrd:
            candidates.extend(inrd.split('|'))
        if not any(name == prefix for prefix in self.prefixes for name in candidates):
            continue

        if not hatnote:  # is {{dablink}} on the page?
            warn("Missing hatnote on [[%s]]" % inpage)
            continue

        wikipedia.output("[[%s]] should be listed as a primary topic" % inpage.replace('_', ' '))
        primlink[inpage] = True
        if inrd:
            for redirect_title in inrd.split('|'):
                self.addprefix(redirect_title)
        else:
            self.addprefix(inpage)

    newprefixes = sorted(self.prefixes.difference(known_before))
    if newprefixes:
        info("Adding prefixes: %s" % EnglishJoin(newprefixes))

    for link in primlink:
        if re.search(r"'''\[\[%s\]\].*" % wikilinkregex(link), self.text):
            wikipedia.output('[[%s]] is already bolded' % link.replace('_', ' '))
            continue
        # Lift the bullet entry out of the list and put it, bolded, in front
        # of everything that preceded it.
        self.text = re.sub(
            r"(?sm)(.*)^[#*]+ *(\[\[%s\]\])[, ]*([^\n]*)\n?" % wikilinkregex(link),
            r"'''\2''' is \3.\n\n\1",
            self.text)
page_title) AS target, COUNT(pl_from) AS "backlinks", pp_value FROM (SELECT page_id, page_namespace, page_title FROM page WHERE page_namespace=0 AND ( """ +' OR '.join(('page_title=%s',)*len(self.prefixes)) +'\n OR \n' +' OR '.join(('page_title LIKE %s',)*len(title_search)) +"""\n OR \n((""" +' OR '.join(('page_title LIKE %s',)*len(name_search)) +""") AND CAST(page_title AS CHAR CHARACTER SET utf8) REGEXP "^[-\'`.[:alpha:]]+(_[[:upper:]][-\'`.[:alpha:]]*)?_[[:upper:]][-\'`[:alpha:]]+$" ) )) AS searched_pages LEFT JOIN pagelinks ON pl_namespace=0 AND pl_title=page_title LEFT JOIN redirect ON rd_from=page_id AND rd_namespace=0 LEFT JOIN page_props ON pp_page=page_id AND pp_propname="defaultsort" GROUP BY page_id ORDER BY /* Parentheses terms first */ INSTR(target, '_(') OR INSTR(target, ',') DESC, target, rd_title IS NOT NULL ASC, backlinks DESC LIMIT 25000 ) AS r GROUP BY target HAVING target NOT IN ("""+','.join(('%s',)*len(self.existingLinks))+""") ORDER BY INSTR(page_title, '_(') AND rd_title IS NULL DESC, FLOOR(LOG10(backlinks)) DESC, Person DESC, Geography DESC, Sortkey ASC LIMIT 500 """, self.disambiguationcategory+(CatPlaces,)+tuple(self.prefixes)+title_search+name_search+zip(*self.existingLinks).pop()) try: query_start = time.time() run_query(title_search=parameters_links(self.prefixes), name_search=parameters_names(self.prefixes)) except MySQLdb.OperationalError as (errno, strerror): self.reconnect() if errno == 1317: # 'Query execution was interrupted' error("Blue link search: Timed out (%d seconds)"% (time.time()-query_start,)) wikipedia.logtime("Blue link search: Retry with simpler query") try: run_query(title_search=parameters_links(self.prefixes), name_search=tuple(likeescape(prefix)+"\\_%" for prefix in self.prefixes), timeout=240) except MySQLdb.OperationalError as (errno, strerror): error("Blue link search (simple): Timed out (%d seconds)"% (time.time()-query_start,)) wikipedia.logtime("Blue link search (simple): Timed out") self.addsection('Blue 
link recovery', "* FAILED %s" % strerror) return else: raise #finally: # with open('./generation_stats/getbluelinks', 'a') as f: # f.write('%d\n' % (time.time()-query_start,)) addlinks = dict( blue = {}, names = {}, places = {}, people = {}, geo = {}, ) rows = cursor.rowcount if rows: heading(2, 'Blue link recovery', className="debug") for tup in cursor: tup = tuple(s.decode('utf-8') if isinstance(s, bytes) else s for s in tup) (page_ns, page, rd_ns, rd_target, dabcount, count, people, geography, sortkey) = tup # Re-link removed links self.text,success = re.subn(RelinkText % wikilinkregex(tup[1]), r'\1[[\2]]\3', self.text, 2) if success: continue # otherwise if people: listkey = 'people' elif geography: listkey = 'geo' elif '(' in page or '-' in page: listkey = 'blue' elif ',' in page: listkey = 'places' else: listkey = 'names' key = rd_target or page addlinks[listkey][key] = (page, dabcount, count, people, geography, sortkey) extra = set([]) for (listkey, listname, maxsize) in ( ('blue', "blue links", 25,), ('people', "People", 40,), ('geo', "Places", 40,), ('names', "names", 20,), ('places', "place-like names", 15,), ): addlist = addlinks[listkey] if not addlist: continue # TODO custom sort key for People # XXX better if sorted by link count (c) # addsort = sorted(addlist, key=lamdba tup: tup[2], reverse=True) most = sorted((t for k,(t,d,c,p,g,s) in addlist.iteritems()), key=lambda t:addlist.get(t, (t,))[-1]) good = sorted((t for k,(t,d,c,p,g,s) in addlist.iteritems() if k==t and d==0), key=lambda k:addlist[k][5]) top = sorted((t for k,(t,d,c,p,g,s) in addlist.iteritems() if k==t and c>=self.median and d==0), key=lambda k:addlist[k][5]) debug(u"%s pages found: %d [%d redirects/dab-linked, %d links ≥ median (%d)]"\ %(listname.capitalize(), len(most), len(addlist)-len(good), len(top), self.median,)) extra |= set(addlist.keys()) if 0 < len(most) <= maxsize // 2: self.addsection('%d recovered %s%s'%(len(most), listname, ' (All redirects)' if len(good)==0 else ''), 
def redlinks(self):
    """Recover red links (links to missing pages) that match this page's terms.

    Queries the link tables for missing main-namespace titles that match
    ``self.prefixes`` and are not already linked from the page.  For every
    hit an HTML report row is printed; plain-text entries already present
    in ``self.text`` are turned into links, and the remaining candidates are
    queued and added through ``self.addsection`` when the list is short
    enough.  On a query timeout (MySQL error 1317) a "FAILED" section is
    added and the method returns; other database errors propagate.
    """
    addlinks = {}
    addnames = {}
    addmissing = {}
    addmisname = {}
    addtemplatelinks = {}
    # Referring pages whose titles look like "missing article" work lists.
    missing_R = re.compile(
        r'missing|encyclopedia|redlinks|wikiproject(?!.*Red_Link_Recovery|.*COIReports)',
        re.I)
    # One copy per prefix; the four placeholders are filled by
    # redLinkParameters() in the same order.
    red_params = (
        'pl_title LIKE %s OR pl_title LIKE %s OR pl_title LIKE %s OR '
        '((pl_title LIKE %s) AND CAST(pl_title AS CHAR CHARACTER SET utf8) REGEXP '
        '"^[-\'`.[:alpha:]]+(_[[:upper:]][-\'`.[:alpha:]]*)?_[[:upper:]][-\'`[:alpha:]]+$")'
    )

    # TODO port over to dab solver
    # https://toolserver.org/~dispenser/cgi-bin/dab_solver.py/Baller_Blockin'_(film)
    # https://toolserver.org/~dispenser/cgi-bin/dabfix.py/Manuel_Vazquez
    # Missing Manuel Vázquez (accient on a from redirect)

    def redlink_fulltext_searcher(dbname, namespace, pfx_list):
        """Return red-link titles found by full-text search (enwiki, ns 0 only)."""
        if namespace != 0 or not dbname.startswith('enwiki') or not pfx_list:
            return ()
        subquery = (
            "( /* related.redlink_fulltext_searcher LIMIT:1 */ "
            "SELECT REPLACE(rls_title_ft, ' ', '_') "
            "FROM u_dispenser_p.redlinks_enwiki "
            "WHERE MATCH (rls_title_ft) AGAINST (%s IN NATURAL LANGUAGE MODE) "
            "/* ORDER BY is implicit */ "
            "LIMIT 8 )"
        )
        try:
            cursor.execute(' UNION '.join((subquery,) * len(pfx_list)),
                           tuple(p.replace('_', ' ') for p in pfx_list))
            # An empty result is a normal outcome, not an error.
            return tuple(row[0] for row in cursor.fetchall())
        except Exception as e:
            # Best effort: this search only feeds the debug output.
            self.reconnect()
            error("Redlink FullText search: %r" % (e,))
            return ()

    print('<div class="debug">')
    print('<br/>'.join(redlink_fulltext_searcher(self.site.dbName(), 0, self.prefixes)))
    print('</div>')

    def redLinkParameters(prefixes):
        """Build the LIKE patterns (four per prefix) consumed by red_params."""
        patterns = ()
        for p in prefixes:
            patterns += (
                likeescape(p) + '\\_(%)',
                likeescape(p) + ',\\_%',
                likeescape(p) + '\\_(%),\\_%',
                likeescape(p) + '\\_%',
            )  # '%\\_'+likeescape(p),)
        return patterns

    def is_topic_like(title):
        # "X, Y", "X (y)", hyphenated or single-word titles are listed as
        # topics; anything else is treated as a (personal) name.
        return any(c in title for c in (',_', '_(', '-',)) or '_' not in title

    # Titles the page already links to (last column of self.existingLinks).
    # A page without links must not produce "NOT IN ()", which is invalid SQL.
    if self.existingLinks:
        existing_titles = tuple(zip(*self.existingLinks))[-1]
        not_existing_sql = ('AND pl_title NOT IN ('
                            + ','.join(('%s',) * len(existing_titles)) + ')')
    else:
        existing_titles = ()
        not_existing_sql = ''

    # Only block comments are used inside the statement so that it does not
    # depend on line breaks surviving.
    query = ("""
/* dabfix.redlinks() LIMIT:90 */
SELECT
  pl_namespace,
  pl_title,
  ns_name, /* text for pl_namespace above */
  COUNT(*) AS link_count,
  SUM(ref.page_namespace = pl_namespace) AS ns_links,
  (SELECT GROUP_CONCAT(DISTINCT DATE_FORMAT(log_timestamp, "%%b %%Y") SEPARATOR ", ")
   FROM logging_ts_alternative
   WHERE log_namespace = pl_namespace
   AND log_title = pl_title
   AND log_action = "delete"
  ) AS log_deletes,
  GROUP_CONCAT(ref.page_namespace SEPARATOR "|") AS ns_context,
  GROUP_CONCAT(ref.page_title SEPARATOR "|") AS context,
  SUM((SELECT STRAIGHT_JOIN COUNT(*)
   FROM templatelinks
   /* MySQL optimizer on some TS database configurations, likely due to bad
    * statistics.  See TS-1190, workaround using STRAIGHT_JOIN */
   JOIN page AS trans ON trans.page_id=tl_from AND trans.page_namespace=0
   WHERE tl_namespace=ref.page_namespace AND tl_title=ref.page_title
  )) AS trans_count
  /* disabled: , SUM(ref.page_title REGEXP "missing|encyclopedia|redlinks") */
FROM page AS ref
JOIN pagelinks ON pl_from = ref.page_id
LEFT JOIN categorylinks ON cl_from = ref.page_id AND cl_to IN ("""
             + ','.join(('%s',) * len(self.disambiguationcategory)) + """)
JOIN toolserver.namespace ON dbname = (SELECT DATABASE()) AND ns_id = pl_namespace
LEFT JOIN page AS pl ON pl.page_namespace=pl_namespace AND pl.page_title=pl_title
WHERE pl.page_id IS NULL
""" + not_existing_sql + """
AND (""" + ' OR '.join((red_params,) * len(self.prefixes)) + """)
AND pl_namespace = 0
/* Content namespaces only */
AND ref.page_namespace IN (0, 2, 4, 6, 8, 10, 12, 14)
/* No disambiguation pages (also hack to correct ns_links) */
AND cl_to IS NULL
GROUP BY pl_namespace, pl_title
/* disabled: HAVING log_deletes IS NULL */
ORDER BY
  /* Parentheses terms first */
  INSTR(pl_title,'_(') OR INSTR(pl_title,',') DESC,
  /* Article backlink count in graduations */
  FLOOR(LOG2(SUM(ref.page_namespace=0))) DESC,
  /* Put deleted items at the bottom */
  log_deletes IS NOT NULL,
  /* disabled case-insensitive alphabetize: pl_title_ci ASC */
  pl_title ASC
""")

    try:
        query_start = time.time()
        cursor.execute(query, self.disambiguationcategory + existing_titles
                       + redLinkParameters(self.prefixes))
    except MySQLdb.OperationalError as e:
        # MySQLdb errors carry (errno, message) in e.args.
        errno = e.args[0] if e.args else None
        strerror = e.args[1] if len(e.args) > 1 else ''
        self.reconnect()
        if errno == 1317:  # 'Query execution was interrupted'
            error("Red link search timed out (%d seconds)" % (time.time() - query_start,))
            self.addsection('Red links', "* FAILED %s" % strerror)
            return  # Abort
        else:
            raise

    rows = cursor.rowcount
    if rows:
        heading(2, "Red link recovery", className="debug")
        print('<ul class="columns">' if rows >= 6 else '<ul>')

    for tup in cursor:
        # FIXME UTF-8 strings are sometimes cut short
        (pl_namespace, pl_title, ns_name, links, ns_links, log_deletes,
         ns_context, context, trans_count) = tuple(
            s.decode('utf-8', errors="ignore") if isinstance(s, bytes) else s
            for s in tup)

        # will we add the link?
        # TODO create blacklist for Wikipedia:Templates with red links/xxx
        intext = re.search(RelinkText % wikilinkregex(pl_title), self.text)
        missing = missing_R.search(context)
        if log_deletes and ns_links == 0:
            # FIXME Combine with better code from other places to classify deleted links
            continue
        elif log_deletes:
            extra_info_html = ', <b>deleted %s</b>' % log_deletes
        elif intext:
            # The title already appears as plain text in a list entry:
            # turn (at most two of) those occurrences into links.
            extra_info_html = '<b>(Linking text)</b>'
            self.text, success = intext.re.subn(r'\1[[\2]]\3', self.text, 2)
        elif missing:
            extra_info_html = '<b>(Missing article)</b>'
            if is_topic_like(pl_title):
                addmissing[pl_title] = True
            else:
                addmisname[pl_title] = True
        elif ns_links >= 2 and links >= 4:
            namespaces = ns_context.split('|')
            if '10' in namespaces:
                # Linked from a template; remember which one.
                extra_info_html = '<b>(%d transclusions)</b>' % (trans_count,)
                try:
                    addtemplatelinks[pl_title] = context.split('|')[namespaces.index('10')]
                except IndexError:
                    addtemplatelinks[pl_title] = ''
            else:
                extra_info_html = '<b>(Recover)</b>'
                if is_topic_like(pl_title):
                    addlinks[pl_title] = True
                else:
                    addnames[pl_title] = True
        else:
            extra_info_html = ''

        # Render HTML list
        prominent = ((ns_links - trans_count >= 2 and links - trans_count >= 4
                      or links >= rows // 5)
                     or intext or missing)
        print('<li class="keeptogether %s"><span>' % ('' if prominent else 'debug',))
        printu(CreateLink(pl_title, className="new"))
        if ns_context != '0':  # Avoid "1 article link"
            if links > ns_links > 0:
                article_part = '%d article link%s / ' % (
                    ns_links - trans_count if ns_links >= trans_count else ns_links,
                    '' if ns_links == 1 else 's')
            else:
                article_part = ''
            label = "%s%d %s%s" % (
                article_part,
                links,
                'article link' if links == ns_links else 'link',
                '' if links == 1 else 's',
            )
            printu(' (%s)' % CreateLink(
                "Special:WhatLinksHere/%s" % pl_title, label,
                addAttribute=' onclick="toggleNode(this.parentNode.nextSibling);return false;"',
                className="rl_expand"))
        print(extra_info_html)
        try:
            if ns_context != '0':
                printu('</span><ul class="whatlinkshere"%s><li>%s</li></ul>' % (
                    ' style="display:none"',
                    '</li>\n<li>'.join(
                        CreateLink(t if n == '0' else (wikipedia.namespaces[int(n)] or '') + ':' + t)
                        for n, t in zip(ns_context.split('|'), context.split('|'),)),
                ))
            else:
                printu('</span> from %s' % (CreateLink(context),))
        except Exception as e:  # FIXME specific exception
            printu('</span>')
            error('excessed max packet size (%r)' % e)
        print('</li>')
    if rows:
        print('</ul>')

    for (addlist, listname, maxsize) in (
        (addlinks, "red links", 25),
        (addnames, "red link names", 10),
        (addmissing, "missing articles", 10),
        (addmisname, "missing names", 10),
    ):
        if not addlist:
            continue
        elif len(addlist) <= maxsize:
            # Sorted so the generated section does not depend on dict order.
            self.addsection('%s' % listname, bullets(sorted(addlist)))
        else:
            wikipedia.output("Too many %s to add (%d/%d)" % (listname, len(addlist), maxsize))

    if addtemplatelinks:
        # NOTE(review): requiring both addnames and addlinks to be non-empty
        # is kept from the original; the intent is not evident from here.
        if len(addtemplatelinks) <= 5 and addnames and addlinks:
            self.addsection('templated red links', ''.join(
                ("* [[%s]], [[Template:%s|]]\n" % (link, template)).replace('_', ' ')
                for link, template in sorted(addtemplatelinks.items())))
        else:
            wikipedia.output("Not adding %d templated red links" % len(addtemplatelinks))
def seealso(self):
    """Suggest "See also" entries: other disambiguation pages linking here.

    Finds pages in the disambiguation categories that link to this page,
    directly or through a redirect, and reports for each whether this page
    already links back.  Pages that are not linked yet are added as an
    'EXPERIMENTAL See also' section.  Any database error is reported, a
    "FAILED" section is added and the method returns.
    """
    dab_slots = ",".join(("%s",) * len(self.disambiguationcategory))
    try:
        page_key = (self.page.namespace(), self.page.title(underscore=True),)
        cursor.execute("""/* dabfix.seealso LIMIT:30 */
SELECT page_title, page_title IN (
  SELECT IFNULL(rd_title, pl_title)
  FROM page
  JOIN pagelinks ON pl_from = page.page_id
  JOIN page AS pl ON pl.page_namespace=pl_namespace AND pl.page_title=pl_title
  LEFT JOIN redirect ON rd_namespace=pl_namespace AND rd_from=pl.page_id
  WHERE pl_namespace=0
  AND page.page_namespace=%s AND page.page_title = %s
) AS Linked
FROM (
  SELECT page.page_title
  FROM page
  JOIN categorylinks ON cl_from = page.page_id
  JOIN pagelinks ON page.page_id = pl_from
  JOIN page AS rd ON rd.page_namespace = pl_namespace AND rd.page_title = pl_title
  JOIN redirect ON rd_from = rd.page_id
  WHERE page.page_namespace = 0
  AND rd_namespace = %s AND rd_title = %s
  AND cl_to IN (""" + dab_slots + """)

  UNION

  SELECT page_title
  FROM page
  JOIN categorylinks ON cl_from = page.page_id
  JOIN pagelinks ON pl_from = page.page_id
  WHERE page.page_namespace = 0
  AND pl_namespace = %s AND pl_title = %s
  AND cl_to IN (""" + dab_slots + """)
) AS r;
""", page_key + (page_key + self.disambiguationcategory) * 2)
    except Exception as e:
        # Best effort: a failed lookup must not abort the whole run.
        self.reconnect()
        error("See also: %r" % (e,))
        self.addsection('Auto-See also', "* FAILED %s" % e)
        return

    rows = cursor.rowcount
    if not rows:
        return

    heading(2, "See also links", className="debug")
    #wikipedia.output("Other disambiguation pages which link here")
    unlinked = set()
    print('<ul class="columns debug">' if rows > 6 else '<ul class="debug">')
    for title, exists in cursor:
        # Decode like the other report methods do, so the row can be
        # written with printu() whatever the driver returned.
        if isinstance(title, bytes):
            title = title.decode('utf-8')
        printu('<li>%s (%s)</li>' % (
            CreateLink(title),
            "already linked" if exists else "may need to be linked"))
        if not exists:
            unlinked.add(title.replace('_', ' '))
    print('</ul>')

    # Don't create an empty section when everything is already linked.
    if unlinked:
        self.addsection('EXPERIMENTAL See also', bullets(sorted(unlinked)))
def mosfixes(self):
    """Apply Manual of Style clean-ups to ``self.text``.

    Three passes: rename well-known section headings, turn bolded or
    ``;``-style pseudo headings into real headings, and append a
    disambiguation template when the page transcludes none of the templates
    linked from MediaWiki:Disambiguationspage and none is found in the text.
    """
    # Rename headings
    heading_renames = (
        ('Real people', 'People'),
        ('Historical persons', 'People'),
        ('Persons', 'People'),
        ('Fictional characters', 'Characters'),
        ('Movies', 'Films'),
        ('Computer gaming', 'Video gaming'),
        ('Computer games', 'Video games'),
        ('First name', 'Given name'),
        ('Last name', 'Surname'),
        ('Geography', 'Places'),
        ('Geographical locations', 'Places'),
        ('Place names', 'Places'),
        ('Miscellaneous', 'Other uses'),
        ('Ohter usages', 'Other uses'),
    )
    for old_name, new_name in heading_renames:
        pattern = r'(?mi)^(=+ *)%s(?= *=+ *$)' % re.escape(old_name)
        self.text = re.sub(pattern, r'\g<1>%s' % new_name, self.text)

    # Use actual headings instead of bolding
    def makeheading(m):
        # luckly the re module uses a temporary string, so self.text is
        # still the unmodified text while the substitution is running.
        preceding = self.text[:m.end()]
        for marker in ('\n====', '\n===', '\n=='):
            if marker in preceding:
                # One level deeper than the deepest heading seen so far.
                depth = marker[1:] + '='
                break
        else:
            if '\n=' in self.text:
                depth = '=='
            else:
                # Use level 3 in no other headers are present
                depth = '==='
        caption = m.group(4)
        title = caption[0].upper() + caption[1:].replace("'''", '')
        return ' '.join((depth, title, depth))

    self.text = re.sub(
        r"(?mi)^('''|;|In '''|As a? *''') *(In |As |)(the |)(([\w\s]|''' and ''')+?) *('''|:| )*$",
        makeheading,
        self.text)

    # copied from [[MediaWiki:Disambiguationspage]]
    setindex_templates = (
        "SIA",
        "Given name",
        "Hawaiiindex",
        "Mountainindex",
        "Plant common name",
        "Disambig-plants",
        "Roadindex",
        "Shipindex",
        "Sportindex",
        "Surname",
    )
    disambig_template = (
        "Disambiguation",
    )

    cursor.execute(
        '/* dabfix.mosfixes LIMIT:30 */ '
        'SELECT 1 '
        'FROM page '
        'JOIN templatelinks ON tl_from=page_id '
        'JOIN pagelinks ON pl_namespace=tl_namespace AND pl_title=tl_title '
        'WHERE page_namespace=%s and page_title=%s '
        'AND pl_from = (SELECT page_id FROM page WHERE page_namespace=8 '
        'AND page_title="Disambiguationspage") ',
        (self.page.namespace(), self.page.title(underscore=True)))
    if cursor.fetchall():
        # The page already transcludes a recognised disambiguation template.
        return

    known_templates = '|'.join(
        wikilinkregex(title) for title in (disambig_template + setindex_templates))
    if re.search(r'\{\{(%s)' % known_templates, self.text):
        return

    if 'given names]]' in self.text:
        addtemplate = "{{given name}}"
    elif 'surnames]]' in self.text:
        addtemplate = "{{surname}}"
    else:
        addtemplate = "{{disambiguation}}"
    # Insert the template ahead of the trailing run of [[...]] / {{...}} lines.
    self.text = re.sub(
        r'(?is)\n*((\n\[\[[^[\]]+\]\]\s*|\n\{\{[^{|}[\]]+\}\}\s*)*)$',
        r"\n\n%s\n\1" % (addtemplate,),
        self.text)
def intro(self):
    """Normalise the introductory line of the disambiguation page.

    Steps, all applied to ``self.text``:

    1. legacy intro templates are replaced by ``{{subst:refer``;
    2. if the page has neither bold text nor ``{{subst:refer``, an
       "'''X''' may refer to:" line built from ``self.prefixes`` is added;
    3. the many free-form phrasings of the intro line are rewritten to one
       of four standard sentences;
    4. a standard sentence with a single bold subject is converted to a
       ``{{subst:refer}}`` call.
    """
    self.text = re.sub(
        r'(?i)\{\{(?:Disamb1|Mayrefer|Mayreferto|May refer to|Refer|Refers)(?=\s*[|}])',
        '{{subst:refer',
        self.text)

    if not ("'''" in self.text or "{{subst:refer" in self.text):
        #TODO should discard minor case varriants
        terms = {}
        for prefix in self.prefixes:
            term = "'''%s'''" % prefix.replace('_', ' ')
            # Keep one spelling per base form returned by strip_variations().
            terms.setdefault(strip_variations(term), term)
        termstext = EnglishJoin(sorted(terms.values()))
        wikipedia.output('Adding "%s may refer to:"' % termstext)
        self.text = "%s may refer to:\n\n" % termstext + self.text

    # Phrasings the pattern below is meant to cover:
    # It may also refer to:
    # '''X''' refers
    # '''X''' can mean:
    # '''X''' can mean the following things.
    # '''X''' can refer to:
    # '''X''' can refer to either:
    # '''X''' can refer to several things:
    # '''X''' can refer to the following:
    # '''X''' can be used to refer to:
    # '''X''' could mean:
    # '''X''' is the name of:
    # '''X''' may also signify:
    # '''X''' may be:
    # '''X''' may mean:
    # '''X''' may refer to any of the following:
    # '''X''' may represent
    # '''X''' may refer to more than one thing:
    # '''X''' means following
    # '''X''' might refer to one of the following:
    # '''X''' has several meanings
    # '''X''' has various meanings:
    # '''X''' has the following meanings:
    # A '''X''' can be:
    # The term '''X''' may refer to any one of the following:
    # The expression '''X''' can refer to:
    # Do you mean...
    # '''X''' may refer to several places:
    # '''X''' may refer to several places in [[L]]
    # '''X''' may be an abbreviation for:
    # '''X''' is an abbreviation for:
    # '''X''' is a [[three-letter acronym]] that may refer to:
    # '''X''' is a [[TLA|three-letter abbreviation]] and represents
    # '''X''' is an abbreviation that may stand for:
    # '''X''' as an [[abbreviation]] may refer to:
    # '''X''' can stand for:
    # The [[abbreviation]] '''X''' can be:
    def refer(m):
        # self.text is still the pre-substitution text while re.sub() runs.
        if re.search(r'(?i)\{\{(Hndis|Hndab|Hndisambig|Bio-dab)', self.text):
            return m.expand(r'\g<subject> is\g<also> the name of:')
        elif re.search(r'(?i)\{\{(Geo-?dis|Geodab)', self.text):
            return m.expand(r'\g<subject> may\g<also> refer to several places:')
        elif re.search(r'\b(acronym|abbreviation|initialism|stand)\b', m.group()):  # or self.page.title().isupper():
            # "initialism" is the spelling the intro pattern itself accepts.
            return m.expand(r'\g<subject> may\g<also> stand for:')
        else:
            return m.expand(r'\g<subject> may\g<also> refer to:')

    # The pattern is pure ASCII, so a plain raw string is equivalent to the
    # former unicode raw literal and is valid syntax on Python 2 and 3.
    intro_pattern = (
        r"^(?:A |And |As an? |The |expression |term |\[*three[ -]letter acronym\]* |\[*acronym\]* |\[*abbreviation\]* |)*"
        r"(?P<subject>('''[^{|}[\]\n']+?''',?( or|) *)+|It|Did you|Do you)"
        r"( can| could| is| is an?| as| as an?| has| may| might|)"
        r"(?P<also> also|)"
        r"([ \-]+(\[\[|\[\[[^{|}[\]<\n>]+\||)"
        r"(be|be an?|means?|meanings?|refers?|represents?|signify|the name|various|used to|several|an?|abbreviation|two|three|four|five|letter|acronym|initialism|stand|to|of|for|and|that|may|one of|any of|any one of|either|following|the following|several things|more than|one thing|several persons|several people|meanings?|several places)"
        r"(\]\]|\b))+"
        r"( in \[\[[^{|}[\]\n]+\]\]|)( \w+|)[:;. ]*$"
    )
    self.text = re.sub(intro_pattern, refer, self.text, flags=re.M | re.U | re.I)

    refer_types = {
        "is the name of": "|type=name",
        "may stand for": "|type=stand",
        # TODO add pseudo place detection
        "may refer to several places": "|type=place",
        "may also refer to": "|type=also",
        "may refer to": "",
    }

    def referTemplate(m):
        subject = m.group('subject')
        title = re.search(r"'''(.+?)'''", subject).group(1)
        # Only a single bold term can be expressed as a template argument.
        if "'''%s'''" % title != subject:
            return m.group()
        name_arg = "" if title == self.page.title() else '|' + title
        return "{{subst:refer" + name_arg + refer_types.get(m.group("refer")) + "}}"

    self.text = re.sub(
        r"(?P<subject>('''[^{|}[\]<\r\n>]+?'''[ ,]*( or )?)+) (?P<refer>may refer to|may also refer to|may stand for|is the name of):",
        referTemplate,
        self.text)
'X': 'Excessively long line', 'F': 'Less then two blue links on the page (WP:2DAB)', 'H': 'Huge unbroken list', 'L': 'No unordered list found', 'U': 'No links on line', } try: with open(logname, 'rb') as log: for line in log: col = line.split('\t') if col[0] == bytes(page_revision): heading(2, "Manual of Style suggestions") print "<p>This disambiguation page has some automatically generated suggestions for fixes:</p>" print '<dl class="mosdab">', '\n'.join("<dt>%s</dt><dd>%s</dd>"%(c,mosdab_codes.get(c, "Unknown code %r"%c)) for c in col[2].split()), '</dl>' break except IOError: wikipedia.output("mosdab-%s.log does not exist" % self.site.dbName()) return def run(self): if not self.page.title(): wikipedia.output(__doc__) return try: self.text = self.page.get() except wikipedia.IsRedirectPage: wikipedia.output("Converting redirect %s" % self.page.aslink()) # Content must end with a template, even {{subst:void}} self.text = re.sub(ur'(?s).*?(\[\[[^[\]\n]*?\]\]).*', ur'{{subst:refer}}\n\n* \1\n{{subst:long comment}}', self.page.get()) except wikipedia.NoPage: # XXX Messing with the internals self.page._contents = "" self.text = "{{subst:refer}}\n*\n\n{{disambiguation}}" # Why isn't NoPage a subclass of something else? 
try: self.page.get() except wikipedia.NoPage: wikipedia.output(u'Page %s not found ' % self.page.aslink()) return except Exception as e: wikipedia.output('Exception: %r' % (e,)) return wikipedia.output(__doc__) print '<a id="Hide_details" href="#Hide_details" class="bigbutton" onclick="toggleDebug(this); return false"><span style="float:left">▼</span> Show details <span style="float:right">▼</span></a>' headings.append("Hide details") # setup self.getprefixes() self.template_redirect() self.doubleredirect() self.primary_entry() self.mosfixes() # generate new links self.addWiktionary() self.bluelinks() self.seealso() self.redlinks() self.addtemplatelinks() self.intro() ## Commonfixes if self.usecommonfixes: heading(2, 'Common fixes', className="debug") import commonfixes self.text = commonfixes.fix(self.text, page = self.page, verbose = False) cgitb.enable(logdir='tracebacks') if len(self.text) > 300: self.text = self.text.replace('{{subst:long comment}}', '') if len(self.text) < 200 and '{{subst:long comment}}' not in self.text: self.text += "\n{{subst:long comment}}" else: # Unpipe text self.text = re.sub(r'\[\[([^{|}[\]]+)\s*\|\s*\1\s*\]\]', r'[[\1]]', self.text) self.text = re.sub(r'(?i)\[\[([^{|}[\]\n]+)([^{|}[\]\n]+)\|\1\]\]\2', r'[[\1\2]]', self.text) # run after creating blue links self.text = re.sub(r'(?<=\[\[)[^{|}[\]\n#]+(?=(?:#[^{|}[\]\n]*|) *(?:\|.*?|)\]\])', skipredirect, self.text) # music specific fixes #self.text = re.sub(r"(?uim)^\*([^,\n]*),(['\"]*) *a? *(song|signle) by ([\w [\]]* band |)(?P<group>[\w [\]]+)(?<!'s) from (his|her|their)(first |second |third |)(?P<year> \d+|)(?P<album> album .*?) 
*$", r"\1\2, a song on \g<group>'s\g<year>\g<album>", self.text) print '<div class="debug">' heading(2, "Entry cleanup") global prefixes; prefixes = self.prefixes self.text = re.sub(r'(?m)^[#*]+.*$', parseline, self.text) print '</div>' # Majority vote on bullet style if self.page.get().count("\n* ") < self.page.get().count("\n*") * 1 / 4: # Remove spaces self.text = re.sub(r'(?m)^([*]+) *', r'\1', self.text) elif self.page.get().count("\n* ") > self.page.get().count("\n*") * 3 / 4: # Add spaces self.text = re.sub(r'(?m)^([*]+) *', r'\1 ', self.text) else: pass self.text = re.sub(r"(m)^(?P<subject>('''[^{|}[\]\n']+?''',?( or|) *)+) may be an abbreviation for[:;]? *$", r"\g<subject> may stand for:", self.text) heading(2, "Diff") wikipedia.showDiff(self.page.get(), self.text) heading(2, "Timeline", className="debug") print '<pre class="debug">%s</pre>' % (wikipedia.escape(wikipedia.timereport()),) heading(2, "Edit box", className="debug") cursor.execute("/* dabfix LIMIT:30 */SELECT page_latest FROM page WHERE page_namespace=%s AND page_title=%s", (self.page.namespace(), self.page.titleWithoutNamespace(underscore=True),)) if (self.page.revisionid, ) != cursor.fetchone() and self.page.revisionid: from dab_solver import OutOfSync print '<div class="mw-warning">%s</div>'%wikipedia.translate(self.site, OutOfSync) print ''' <div id="autoremove" style="clear:both; text-align:center;"> <button onclick="removeDescriptions()">Remove auto-descriptions</button> <button onclick="removeSections()">Remove auto sections</button> <!--button onclick="toggleCleanup()">Tag/Untag for cleanup</button--> </div>''' self.page.put(self.text, comment="Cleanup per [[WP:MOSDAB]] using [[tools:~dispenser/cgi-bin/dabfix.py|Dabfix]]") self.mosdab_suggestions(self.page.revisionid) print '<div class="debug columns" style="clear:both;">' print '<form action="javascript://">' def optsummary(name, label, checked=False): htmlout('<input id="%s" type="checkbox" %s/><label for="%s">%s</label><br/>', (name, 
' checked="checked"' if checked else '', name, label)) summaries = r""" Edit summary builder <DROPDOWN, COMMON DEFAULTS> javascript:alert("One blue link per line per [[MOS:DABENTRIES]]; No external links per [[MOS:DABENTRIES]]; remove puncuation per [[MOS:DABENTRIES]];".replace(/(per [\w: [\]]+; )(.*?)(?=\1)/g, 'and $2')) Custom with each linking MOS:DAB section Cleaned out advertising Fixed lifespans per --- [[MOS:DABOTHERLANG]] [[MOS:DABNAME]] [[MOS:DABENTRIES]] [[MOS:DABINT]] Fix section [[MOS:DABMENTION]] [[MOS:PIPING]] remove pipe [[MOS:DABRL]] [[MOS:DABSY]] shortened bios, per MOS:DAB; there should only be enough to differentiate Remove Format with quotation marks and italics Ordered entried alpabetically Heading ... * TOC in first section only Disambig page style repair ---- edit per WP:MOSDAB [ ] [ ] Repaired link to disambiguation page: [[Wikipedia:WikiProject Disambiguation|please help]] """ summaries = r""" remove puncuation unpiping no external links one blue link per line removing red link without blue cleanup per [[WP:MOSDAB]] using [[tools:~dispenser/cgi-bin/dabfix.py|Dabfix]] """ count=0 for summary in summaries.strip().split('\n'): optsummary("summery%d"%count, summary) count+=1 print '</form>' print '</div>' try: cursor.execute('''/* dabfix */ SELECT pb_title AS "Project", COUNT(*) AS "Links to here" FROM pagelinks JOIN u_dispenser_p.projectbanner ON pb_page = pl_from WHERE pl_namespace=0 AND pl_title IN (%s) GROUP BY pb_title ORDER BY 2 DESC LIMIT 10 ''' % ','.join(('%s',)*(1+len(self.redirects))), tuple([self.page.titleWithoutNamespace(underscore=True)]+self.redirects)) print '<div style="clear:both;">Pages that link here belong to the following wikiprojects</div>' print '<table class="wikitable sortable">' print '<tr>' for tup in cursor.description: print '<th>%s</th>'%tup[0] print '</tr>' for tup in cursor: print '<tr>' print '\n'.join('<td>%s</td>'%str(item) for item in tup) print '<tr>' print '</table>' except: pass # Floating ToC print 
def main():
    """Print the bookmarklet/help box, then build a Robot and run it."""
    # The box is assembled from its fragments; they are joined with single
    # spaces, which is exactly how they are separated in the literal.
    box = ' '.join((
        '<div class="toccolours" style="padding:0.5em; float:right; text-align:center;">',
        '<b>Bookmarklet</b><br/>',
        '''<a href="javascript:location='//toolserver.org/~dispenser/cgi-bin/dabfix.py/'+wgPageName+'?client=bookmark';">DabFix bookmarklet</a><br/>''',
        '<a href="//toolserver.org/~dispenser/cgi-bin/godab.py?tool=dabfix.py&file-random=logs/mosdab-enwiki.log">Random cleanup</a> (<a href="//toolserver.org/~dispenser/logs/mosdab-enwiki.log">list</a>)<br/>',
        '<b>Questions? Comments?</b><br/>',
        'Live chat: <a href="irc://irc.freenode.net/%23%23dispenser">irc</a> <b>·</b> <a href="//webchat.freenode.net/?channels=%23%23dispenser">web</a><br/>',
        '</div>',
    ))
    print(box)
    Robot().run()
# Script entry point: runs only when executed directly and when
# wikipedia.handleUrlAndHeader() returns a true value.
# The ``head`` argument carries the page's CSS and client-side JavaScript.
# Changes to that payload:
#   * it is laid out one statement per line again -- its "//" comments
#     (starting with "//<![CDATA[") need real line breaks, otherwise they
#     comment out everything that follows;
#   * htmlescape() now emits character references instead of replacing
#     "&", ">" and "<" with themselves;
#   * removeDescriptions() increments the per-description counter instead
#     of "+="-ing the whole array (which turned it into a string).
#     NOTE(review): per-index counting is the apparent intent -- confirm.
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        wikipedia.startContent(form=True, head=r"""<style type="text/css">
#mw_portlets { z-index:1; }
.quickjump {
    position:fixed;
    _position:absolute;
    background-color:#fff;
    background-color:rgba(255, 255, 255, .8);
    border:1px solid #ccc;
    right:1em;
    top:16em;
    width:13em;
}
#wikiPreview { background-color:#ffe4c4; border:2px inset; padding:0.3em; }
.info { background: url(//upload.wikimedia.org/wikipedia/commons/7/75/Information-silk.png) no-repeat right; }
.warn { background: url(//upload.wikimedia.org/wikipedia/commons/4/49/Error.png) no-repeat right; }
.error { background: url(//upload.wikimedia.org/wikipedia/commons/c/c0/Exclamation.png) no-repeat right; }
a.bigbutton {
    background: #eee;
    border: 1px solid;
    /* clear:both; /* */
    color: #777;
    display: block;
    font: bold 2em/200% sans-serif;
    padding:0 3em;
    text-align: center;
    text-decoration:none;
}
a.bigbutton:hover { color:#000; }
.columns {
    -webkit-column-width:30em;
    -moz-column-width:30em;
    list-style-position: inside; /* WebKit bug #23053 */
}
.columns ul {
    -webkit-column-span:1; -moz-column-span:1; column-span:1;
    -webkit-break-inside:avoid; -moz-break-inside:avoid; break-inside:avoid;
}
.columns .keeptogether { display: inline-block; width: 100%; }
samp { background:#eee; border:1px solid navy; border-radius:4px; -moz-border-radius:4px; -webkit-border-radius:4px; padding:2px; }
.autoremove { clear:both; }
a.rl_expand { border-bottom:1px dotted blue; }
dl.mosdab dt { display:none; }
dl.mosdab dd { display: list-item; }
/* WikEd extra styles */
#wikEdInputWrapper { background:#eee; padding:2px 0.5em 0; }
#wikEdDiffWrapper { display:none; }
</style><style type="text/css" id="debugstyle">
.debug { display:none; }
</style><script type="text/javascript">//<![CDATA[
var autosection = [];
var autosectionname = [];
var autodescript = [];
var autodescripttitle = [];
var autodescriptremoves = []
function AddedSection(section, section_R) {
    autosection.push(section_R);
    autosectionname.push(section);
}
function removeSections() {
    if (wikEd.useWikEd) { wikEd.UpdateTextarea(); wikEd.textareaUpdated = true; }
    var editbox = document.getElementById('wpTextbox1');
    var failList = [];
    for (var i=0; (s=autosection[i])!=null;i++) {
        if(!editbox.value.match(s)) {
            failList.push(autosectionname[i]);
        }
        editbox.value = editbox.value.replace(s, "\r\n\r\n");
    }
    if (wikEd.useWikEd) { wikEd.UpdateFrame(); }
    if (failList.length > 0) {
        alert("Unable to remove the following sections:\n\n"+failList.join('\n') );
    }
}
function AddedDescription(title, s) {
    autodescript.push(s);
    autodescripttitle.push(title);
    autodescriptremoves.push(0);
}
function fixDescript(desc, string) {
    // XXX how is this suppose to work?
    // Dates are added in another stage, so we attempt to
    // match that pattern if this match fails
    if(string.indexOf(desc)<0) {
        desc = desc.replace(/^,( the| an| a)(?= \d+|\b)/, ",");
    }
    return desc;
}
function removeDescriptions() {
    if(wikEd.useWikEd) { wikEd.UpdateTextarea(); wikEd.textareaUpdated = true; }
    var editbox = document.getElementById('wpTextbox1');
    var failList = [];
    var s;
    for(var i=0; (s=autodescript[i])!=null;i++) {
        s = fixDescript(s, editbox.value);
        if (editbox.value.indexOf(s) == -1) {
            failList.push("*[["+autodescripttitle[i]+"]]"+autodescript[i]);
        } else {
            autodescriptremoves[i] += 1;
        }
        editbox.value = editbox.value.replace(s, "");
    }
    if(wikEd.useWikEd) { wikEd.UpdateFrame(); }
    if(failList.length) {
        alert("Failed to removed "+(failList.length)+" of "+(i)+"\n----\n"+failList.join('\n') );
    } else {
        alert("Removed all "+i);
    }
}
function HighlightDescriptions() {
    function htmlescape(str){
        return str.replace(/&/g, '&amp;').replace(/>/g, '&gt;').replace(/</g, '&lt;');
    }
    wikEd.UpdateTextarea();
    var html = wikEd.textarea.value;
    html = htmlescape(html);
    var s;
    for(var i=0; (s=autodescript[i])!=null;i++) {
        s = htmlescape(s);
        s = fixDescript(s, html);
        html = html.replace(s, '<span class="wikEdKeep autodesc">$&</span>');
    }
    if(wikEd.useWikEd) { wikEd.UpdateFrame(html); }
    else { wikEd.UpdateTextarea(html.replace(/\n/g, '<br/>')); }
}
addOnloadHook(function(){
    var editform = document.getElementById('editform')
    if(editform && autosection) {
        editform.onsubmit = function(){
            var editbox = document.getElementById('wpTextbox1');
            function countAD() {
                var count = 0;
                for(var i=0; (s=autodescript[i])!=null; i++)
                    if(editbox.value.indexOf(fixDescript(s, editbox.value))!=-1)
                        count++;
                return count;
            }
            adTotal = autodescript.length - countAD();
            for(var m, i=0; (s=autosection[i])!=null; i++)
                if(m=editbox.value.match(s)){
                    if((m[2]||"").match(/^\s*$/)) {
                        editbox.value = editbox.value.replace(m[0], "\n");
                        // FIXME update wiked
                        continue;
                    }
                    if(confirm("Do you want to remove the auto sections now?"))
                        removeSections();
                    else
                        return false;
                    break;
                }
            // Useful bug: Doesn't work right after removeSections()
            var count = countAD();
            var ratio = Math.round(1000 * count / (adTotal + count)) / 10;
            if(ratio > 80) {
                if(!confirm((ratio==100?"All ":ratio+"% of ")+"descriptions remain in their automatic form. \nUse 'Remove auto descriptions' button to remove unmodified auto descriptions. \n\nContinue anyway?"))
                    return false;
            }
            editbox.value = editbox.value.replace(/\n+(?=\n\n)/g, ""); // remove double newlines
            editbox.value = editbox.value.replace(/^\s+|\s+$/g, ""); // Trim field
            if((editbox.value.match(/[\n\r](=+).*?\1/g)||"").length<=3) {
                // Avoid {{tocright}} with infoboxes
                if(!editbox.value.match(/\{\{Infobox/i)) {
                    editbox.value = editbox.value.replace(/\n?\{\{tocright\}\} *\n?/,"\n");
                }
            } else if(editbox.value.indexOf("{{tocright}}")==-1) {
                editbox.value = editbox.value.replace(/^([\s\S]*?)([\r\n]=+.*?=+ *[\r\n])/, "$1\n{{tocright}}$2");
            } else {
                // exists
            }
            if(editbox.value.length > 300)
                editbox.value = editbox.value.replace("{{subst:long comment}}", "");
            //if(wikEd.useWikEd)wikEd.UpdateFrame();
            warnOnLeave=false;
            return true;
        }
    }
})
function toggleNode(node) {
    return node.style.display=(node.style.display?'':'none');
}
function toggleDebug(node) {
    var debugstyle = document.getElementById("debugstyle");
    if(debugstyle.disabled){
        node.innerHTML = '<span style="float:left">▼</span> Show details <span style="float:right">▼</span>';
        debugstyle.disabled = false;
    } else {
        node.innerHTML = '<span style="float:left">▲</span> Hide details <span style="float:right">▲</span>';
        debugstyle.disabled = true;
    }
}
function toggleCleanup() {
    if(wikEd.useWikEd) { wikEd.UpdateTextarea(); wikEd.textareaUpdated = true; }
    var editbox = document.getElementById('wpTextbox1');
    editbox.value = editbox.value.replace("\{\{disambiguation", "{{disambig-cleanup").replace("{{disambig-cleanup-cleanup", "{{disambiguation");
    if(wikEd.useWikEd) { wikEd.UpdateFrame(); }
}
function CommentHandler(obj) {
    WikEdGetText(obj, 'selection, cursor');
    obj.changed = (obj.selection.plain != '' ? obj.selection : obj.cursor);
    // make the changes to the plain target text
    if ( /<!--([\s\S]*?)-->/g.test(obj.changed.plain) ) {
        // remove formatting
        obj.changed.plain = obj.changed.plain.replace(/<!--([\s\S]*?)-->/g, '$1');
    } else {
        // add formatting
        obj.changed.plain = '<!--' + obj.changed.plain + '-->';
    }
    // keep the changed text selected, needed to remove the formatting with a second custom button click
    obj.changed.keepSel = true;
    return;
}
// load WikEd
wikEd = {
    useWikEd: null,
    config: {
        /* disable jumping around */
        doCloneWarnings: false,
        focusEdit: false,
        scrollToEdit: false,
        //wikEdNoRearrange: true,
        /* disable AJAX functions */
        autoUpdate: false,
        useAjaxPreview: false,
        /* enable enhanced diff */
        loadDiff: true,
        /* enable InstaView */
        loadInstaView: true,
        LinkifyArticlePath: "//en.wikipedia.org/wiki/$1",
        // FIXME wgServer doesn't exist until after the script loads
        // LinkifyArticlePath: wgServer+wgArticlePath,
        /* Customize buttons */
        buttonBar: {
            'format': ['wikEdButtonBarFormat', 'wikEdButtonBarFormat', 'wikEdButtonsFormat', 'wikEdButtonsFormat', 44, 'wikEdGripFormat',[ 13,01,02,14,11,80,24,17,'br', 03,04,07,15,10,12,19,21 ] ],
            'fix': ['wikEdButtonBarFix', 'wikEdButtonBarFix', 'wikEdButtonsFix', 'wikEdButtonsFix', 44, 'wikEdGripFix', [ 52,53,55,56,'br', 58,59 ] ],
            'control': ['wikEdButtonBarControl', 'wikEdButtonBarControl', 'wikEdButtonsControl', 'wikEdButtonsControl', 44, 'wikEdGripControl', [ 29,35,30,37,'br', 77,34,33 ] ],
            'textify': ['wikEdButtonBarTextify', 'wikEdButtonBarTextify', 'wikEdButtonsTextify', 'wikEdButtonsTextify', 44, 'wikEdGripTextify', [26,27, 101, 100] ]
        },
        frameCSS: { '.autodesc': 'background-color: #ffd700; color: black;' },
        button: {
            100: ['wikEdComment', 'wikEdButton', 'Comment out', '//upload.wikimedia.org/wikipedia/en/3/34/Button_hide_comment.png', '23', '23', 'DIV', 'WikEdEditButton(this, this.id, null, CommentHandler);' ],
            101: ['wikEdHighDesc', 'wikEdButton', 'Highlight auto-descriptions', '//upload.wikimedia.org/wikipedia/commons/c/cb/Button_S_yellow_author.png', '23', '23', 'Descriptions', 'HighlightDescriptions()' ]
        }
    }
}
if (navigator.appName != 'Microsoft Internet Explorer') {
    importScriptURI('//en.wikipedia.org/w/index.php?title=User:Cacycle/wikEd.js&action=raw&ctype=text/javascript');
    function FixInstaView(){
        if(typeof(InstaView)=="undefined" || !InstaView.conf)return;
        InstaView.conf.paths = {
            articles: wgServer+'/wiki/',
            math: wgServer+'/math/',
            images: '',
            images_fallback: '//upload.wikimedia.org/wikipedia/commons/',
            magnify_icon: wgServer+'/skins-1.5/common/images/magnify-clip.png'
        }
        clearInterval(fiv_timer)
    }
    var fiv_timer = setInterval("FixInstaView()", 1000)
}
//]]></script>""")
        main()
    finally:
        # Runs even when main() raises, so the content is always ended.
        wikipedia.endContent()
        wikipedia.stopme()