#!/usr/bin/env python # -*- coding: utf-8 -*- import cgi, re, time import wikipedia submitbutton = 'wpDiff' #dead_templates = r'[Dd]ead[ _]*link|[Dd]l|[Dd]l-s|404|[Bb]roken[ _]+link|[Cc]leanup-link' dead_templates = r'[Dd]eadlink|[Dd]l|[Dd]l-s|404|[Bb]roken[ _]+link|[Cc]leanup-link' removeDeadTemplate = re.compile(ur'((\[XxNEEDLExX[^]]*?\]|\{\{[^{}]*XxNEEDLExX[^{}]*\}\})(\s*|))(\s*?\{\{[Dd]ead link[^}]*\}\})+', re.DOTALL) now = time.strftime("%Y-%m-%d") def getfirst(dict, name, defaultValue=None): return dict.get(name, [defaultValue])[0] def removeDeadNote(text): # replaces the {{dead link}} template suceding the XxNEEDLExX value return removeDeadTemplate.sub(r'\1', text) def removeDuplicate(text): """ Remove the newest duplicate {{dead link}} tag """ #FIXME english only m = re.compile('(\{\{[Dd]ead link[^}]*?\}\})+(()?\{\{[Dd]ead link[^}]*?\}\})') text = m.sub(r'\2', text) # Requested by [[User:Tim1357]] if not re.search(dead_templates, text): text = re.sub(r'(?P\n)?\{\{\s*(?:[Tt]emplate:|)(?:[Dd]ead[ _]+link[ _]+header|[Dd]ead[ _]+links|[Dd]eadlinks)(?:\|[^{}]*|)\s*\}\}(?(nl)\n?)', '', text) return text alphanum = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" def webCiteShortId(t): """ WebCite's short identifier is the time measure in microsecond since 1970 of the date of the archive request stored as a base-62 number. """ s = "" while (t >= 1): s = alphanum[t%62] + s t /= 62 # python 2 integer division return s def webCiteTime(id): t = 0 for s in id: t *= 62 t += alphanum.index(s) return t def main(): print '' print 'mergeChanges.py' print '' print '' # TODO replace query w/ form after testing #form = cgi.FieldStorage(keep_blank_values=True) query = cgi.parse(keep_blank_values=True) if query == {}: print "No form fields given" return page = wikipedia.Page( wikipedia.getSite(), getfirst(query, 'title', '') ) #page = wikipedia.Page( wikipedia.getSite(), form.getfirst('title', '') ) print '' print '' print '

%s

' % page.title().encode('utf-8') # Some clients have trouble with the javascript submittion if page.title() == '': print '

No page title has been given

This may be cause by a bug in your browser; Safari 4.0.1 and some version of Firefox do not handle the combined POST request and query strings correctly. Please report this bug to your vender (typically in the help menu).


' #print "
%r
"%query cgi.print_environ() f = open('./tracebacks/mergechanges.py', "w") f.write("
%r
\n
%r
"%(cgi.os.environ, query)) #return print '

Preparing diff, this page will auto submit with JavaScript.

' print 'loading...' print '

' # Should we run reflinks on this? reflinks = 'reflinks' in query.get('addons',[]) #reflinks = 'reflinks' in form.getlist('addons') wpSummary = '' text = page.get() if 'commonfixes' in query.get('addons',[]): #if 'commonfixes' in form.getlist('addons'): import commonfixes text = commonfixes.fix(page=page, text=text) # Use "Dead link" for all the dead link templates, # Capitlized since Smackbot/AWB "corrects" it text = re.sub(r'\{\{\s*(%s)(?=\s*[|}])' % dead_templates, r'{{Dead link', text) # Since we get the browser's normalized URL we need to proform the same normalization to properly match the two #TODO un-normalize? text = re.sub(r'(\[http[s]?://[A-Za-z0-9\-.:]+\w+)([ \]])', r'\1/\2', text) # adds / to the end of domains print '%r'%query #print '%r'%form for (action, list) in query.iteritems(): #for (action, list) in form.iteritems():# FIXME try: needle = unicode(list[0], 'utf-8') except UnicodeDecodeError: needle = unicode(list[0], 'latin1') text = text.replace(needle, 'XxNEEDLExX') repl = ''# None if len(list)>1: if not 'XxNEEDLExX' in text and action not in ('addons',): #HACK print('
Cannot find string: %s
' % needle.encode('utf-8')) try: repl = unicode(list[1], 'utf-8') except UnicodeDecodeError: repl = unicode(list[1], 'latin1') repl = wikipedia.html2unicode(repl) # \ is escaped as to avoid \1 in strings, \\ will be intrupited as \ # Cannot use re.escape() since the escapement is not undone by re.sub() repl = repl.replace("\\", '\\\\') if action == "wpSummary": try: wpSummary = unicode(list[0], 'utf-8') except: wpSummary = unicode(list[0], 'latin-1') elif action == "wpSection": pass # Which submit button should the javascript click elif action in ('wpSave', 'wpPreview','wpDiff'): submitbutton = action elif action in ("title", "ServerPath", "addons"): # actions are performed elsewhere pass # Regex elif action.startswith('regex'): p = re.compile(needle) if p.search(text): text = p.sub(repl, text) else: print('
Append error: Not in template or bracketed link: %s
' % needle.encode('utf-8')) # {{dead link}} elif action.startswith('append'): #p = re.compile(r'(\[XxNEEDLExX.*?\]|\{\{[^{}]*=\s*XxNEEDLExX[^{}]*\}\}) *?(|)') p = re.compile(r'(\[XxNEEDLExX[^]\n]*?\]|\{\{[^{}]*=\s*XxNEEDLExX[^{}]*\}\})()') text = re.sub(r'(?<=[>])\s*XxNEEDLExX\s*(?=)', r'[XxNEEDLExX]', text) text = re.sub(r'(\n[*#:;]+ *|[<>"\]] *)XxNEEDLExX(?=[][<>\s"])', r'\1[XxNEEDLExX]', text) if p.search(text): # Append repl text = p.sub(r'\g<1>\g<2>%s' % repl, text) # remove double {{dead link}} text = removeDuplicate(text) elif needle == '__START__': text = repl + text elif needle == '__END__': text = text.rstrip() + repl else: print('
Append error: Not in template or bracketed link: %s
' % needle.encode('utf-8')) # {{fact}} elif action.startswith('replacereference'): p = re.compile(r'\[XxNEEDLExX( [^]\n]*|)\]|\{\{[^{}]*=\s*XxNEEDLExX[^{}]*\}\}|(?<=[>])\s*XxNEEDLExX\s*(?=)') if p.search(text): text = removeDeadNote(text) text = p.sub(repl or "", text) else: print('
Replace link error: Not in template or bracketed link: %s
' % needle.encode('utf-8')) elif action.startswith("unlink"): text = re.sub(r'(?<=[>])\s*XxNEEDLExX\s*(?=)', r'[XxNEEDLExX]', text) wikipedia.output( re.search(r'.{0,20}XxNEEDLExX.{0,20}', text).group()) if "[XxNEEDLExX" in text: text = removeDeadNote(text) text = re.sub(r'\[XxNEEDLExX([^]\n]*)\]', r'\1', text) elif re.search(r'url\s*=\s*XxNEEDLExX', text): text = removeDeadNote(text) text = re.sub(r'\|\s*url\s*=\s*XxNEEDLExX\b\s*', r'', text) #TODO: added archiveurl remover (maynot be used much...) for m in re.finditer(r'\{\{\s*([Cc]ite[ \-_]*[Ww]eb|[Ww]eb[ _]*refernce|[Ww]eb cite|[Cc]ite|[Cc]itation)([^{}]*)\}\}', text): if not re.search('\|\s*url\s*=\s*(?![|}])', m.group()): # typically cite news will substitue text = text.replace(m.group(), m.expand(r'{{cite news\2}}')) else: #print '%s'% text.encode('utf-8') print '
Unlink error: Not in template or bracketed link: %s
' % needle.encode('utf-8') wikipedia.output( re.search(r'.{0,20}XxNEEDLExX.{0,20}', text).group()) # Archive/Replace URL elif action.startswith('substitute') or action.startswith('replace') or action.startswith('archive') or action.startswith('replacelink'): # Archiveurl, just complicated archivedate = None # WebCite m = re.search(r'webcitation.org/([0-9A-Za-z]{9})', repl) if m: archivedate = time.strftime("%Y-%m-%d", time.gmtime(webCiteTime(m.group(1))/1000000)) # Wayback Machine m = re.search(r'^https?://web\.archive\.org/web/(199[6-9]|20\d\d)(0[1-9]|1[0-2])([0-3]\d)[012]\d[0-5]\d\d\d/.*$', repl) if m: archivedate = m.expand(r'\1-\2-\3') if not archivedate and action.startswith('archive'): wikipedia.output('\03{darkred}Warning\03{default} : Bad archivedate, replacing with current date.') archivedate = now # #FIXME replace multiple! # if archivedate and re.search(r'\[XxNEEDLExX(?=[<>"[\]\s])', text): # text = removeDeadNote(text) # text = re.sub(r'') R = re.compile(r'(?i)\{\{\s*(?!dead link)(?:[^{}]+)\|\s*url\s*=\s*XxNEEDLExX\s*(\}\}|\|)', re.I) if archivedate and R.search(text): if re.search(r'url\s*=\s*XxNEEDLExX[^}]*archiveurl\s*=\s*[^\s|}]|archiveurl\s*=\s*[^\s|}][^}]*url\s*=\s*XxNEEDLExX', text): wikipedia.output('\03{darkred}ERROR\03{default} archiveurl= already used, skipping (%s)' % repl) else: text = removeDeadNote(text) # If inside a template, use the archiveurl= feature # There is some weird bug in the regular expression engine, it seems to try and match double digit back references #wikipedia.output(str(re.search(ur'(?us)((\s*\|\s*)url(\s*=\s*)XxNEEDLExX)', text).groups())) #wikipedia.output(ur'\1\2archiveurl\3%s\2archivedate\3%s' % (repl, archivedate)) text= re.sub(ur'(?us)((\s*\|\s*)url(\s*=\s*)XxNEEDLExX)(?=\s*[|}])', ur'\g<1>\g<2>archiveurl\g<3>%s\g<2>archivedate\g<3>%s' % (repl, archivedate), text, 1) elif action.startswith('archive'): if "[XxNEEDLExX" in text: text = removeDeadNote(text) text = re.sub(ur'(?u)\[(XxNEEDLExX) *((?<= )[^\]\n]+?|)\]', ur'{{cite web |url=\1 |title=\2 |archiveurl=%s |archivedate=%s}}' % (repl, archivedate), text) ## needs code for other archiving resources #text = re.sub(r'\[(XxNEEDLExX)( +([^]]+))?\]', r'{{waybackdate|site=\1|title=\3|date=%s}}' % archivedate, text) else: if "XxNEEDLExX" in text: print '''
Error: Could not substitute url.
This may happen because the URL is "free" (without a titlte) or something else.
%s
'''%re.search(r'\b.{0,20}XxNEEDLExX.{0,20}\b', text).group().encode('utf-8') else: print '
Error: Could not find url.
' else: # Substitute the current link with a new link text = removeDeadNote(text) # Python doesn't support (?\s"]*) # Bug we assume human review of the replace (hopefully, wont be human bots) text = re.sub(r'(?"\s\|\}])', r'%s\g<1>'%repl, text) # Update accessdate entries elif action.startswith("updateaccessdate"): #FIXME not quite correct - what is? for l in re.finditer(r'\{\{[^}]+XxNEEDLExX.*?\}\}', text, re.DOTALL): reftext = l.group() reftext = re.sub(r'(\|\s*accessdate\s*= ??)(?=\n* *[{|}])', time.strftime(r'\g<1>%Y-%m-%d'), reftext) if not re.search('\|\s*accessdate=\s*=\s*', reftext): reftext = re.sub(r'(\{\{[^{}]+?)((\s*\|\s*)[^[=\]{|}]+?(\s*= *)[^{|}]+?)(\s*)\}\}', time.strftime(r'\1\2\3accessdate\g<4>%Y-%m-%d\5}}'), reftext) text = text.replace(l.group(), reftext) # Print alerts if something was not used else: print '
Unused: %s\t= 
' % (action, action, wikipedia.unicode2html(needle)) text = text.replace('XxNEEDLExX', needle) # HACK text = text.replace(' ', '') ## HACK text = re.sub(ur'(\{\{(?:[Cc]ite web|[Cc]ite news)[^{}]*?)(\s*\|\s*)url(\s*=\s*)(?Phttp://web.archive.org/web/(?P\d{4})(?P\d{2})(?P\d{2})\d{6}/(?Phttp://[^][{}|<>"\s]+))(?=\s*[{|}])', ur'\1\2url\3\g\2archiveurl\3\g\2archivedate\3\g-\g-\g', text) # Remove ajacent {{dead link}} text = removeDuplicate(text) if reflinks: try: import reflinks def my_reflinks_put_page(self, page, new): self.page = page self.new_text = new reflinks.ReferencesRobot.put_page=my_reflinks_put_page except ImportError: wikipedia.output('Unable to import reflinks') reflinks = None # Hackist hook page._contents = text if page.get() != text: wikipedia.output("Injected text wasn't returned with page.get() !") elif reflinks.linksInRef.search(text): reflinksbot = reflinks.ReferencesRobot(iter([page])) reflinksbot.run() if hasattr(reflinksbot, 'new_text'): if reflinksbot.page != page:raise 'pages not the same' text = reflinksbot.new_text # remove extra {{dead link}} added by reflinks text = removeDuplicate(text) page.put(text, wpSummary) else: page.put(text, wpSummary) # click the submit button print '' % submitbutton print "" print '' if __name__ == "__main__" and wikipedia.handleUrlAndHeader(connicalize=False): try: print '' print '' main() finally: print ''