#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Reduced function set wikipedia.py for website access

Notes:
While reading through parts of the pyWikipedia code I have noticed that it
underutilizes itself

== Converting scripts into tools ==
Add to top:
#!/usr/bin/env python

Change the loader to the following:
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        wikipedia.startContent()
        main()
    finally:
        wikipedia.endContent()
        wikipedia.stopme()
'''
import os
import re
import sys
import time
import urllib
# pywikipedia imports
import config
# HTML debugging: tracebacks are rendered as HTML and logged to ./tracebacks
import cgitb; cgitb.enable(logdir='tracebacks')

## Python on Solaris handle SIGPIPE different than it does under bash linux
#import signal; signal.signal(signal.SIGPIPE,signal.SIG_DFL)

# Wall-clock time at import; endContent() reports the elapsed time from here.
startTime = time.time()

# Language codes recognised as interwiki / language prefixes
# (see Site.isInterwikiLink and Page.__init__).
supportedLanguages = (
    # Languages with over 100,000 articles
    'ca', 'cs', 'de', 'en', 'es', 'eo', 'fr', 'it', 'hu', 'nl', 'ja', 'no',
    'pl', 'pt', 'ru', 'ro', 'fi', 'sv', 'tr', 'uk', 'vo', 'zh',
    # Languages with over 10,000 articles
    'af', 'ar', 'ast', 'ht', 'az', 'bn', 'be', 'be-x-old', 'bpy', 'bs', 'br',
    'bg', 'cy', 'da', 'et', 'el', 'eu', 'fa', 'gl', 'ko', 'hi', 'hr', 'io',
    'id', 'is', 'he', 'jv', 'ka', 'ku', 'la', 'lv', 'lb', 'lt', 'mk', 'mr',
    'ms', 'new', 'nn', 'nap', 'oc', 'pms', 'nds', 'sq', 'scn', 'simple',
    'ceb', 'sk', 'sl', 'sr', 'sh', 'su', 'tl', 'ta', 'te', 'th', 'vi', 'wa',
    # Languages with over 1,000 articles
    'als', 'am', 'an', 'roa-rup', 'frp', 'zh-min-nan', 'map-bms', 'bh', 'bar',
    'co', 'cv', 'pdc', 'dv', 'ang', 'fo', 'fy', 'fur', 'ga', 'gv', 'gd',
    'gan', 'gu', 'zh-classical', 'haw', 'hy', 'hsb', 'ilo', 'ia', 'ie', 'os',
    'kn', 'pam', 'csb', 'kw', 'km', 'lad', 'lij', 'li', 'ln', 'lmo', 'ml',
    'mt', 'mi', 'mn', 'nah', 'nds-nl', 'ne', 'nrm', 'nov', 'uz', 'pi', 'pag',
    'ps', 'kk', 'ksh', 'rm', 'qu', 'sa', 'se', 'sco', 'sw', 'roa-tara', 'tt',
    'tg', 'to', 'tk', 'ur', 'vec', 'fiu-vro', 'vls', 'war', 'wuu', 'yi',
    'zh-yue', 'yo', 'diq', 'bat-smg',
    # Hack for weird families (also present in pywikipedia)
    'm', 'meta', 'commons',
)
def datafilepath(*filename):
    """Return the normalised path of a data file below ``../resources/``.

    The directory that will contain the file is created if it is missing.
    """
    path = os.path.normpath(os.path.join('../resources/', *filename))
    dirs = os.path.dirname(path)
    try:
        os.makedirs(dirs)
    except OSError:
        # Already present (possibly created concurrently by another request);
        # anything else is a real error.
        if not os.path.isdir(dirs):
            raise
    return path


# Interwiki family prefix -> family name (second-level domain label).
families = {
    'w':       'wikipedia',
    'wikt':    'wiktionary',
    'n':       'wikinews',
    'b':       'wikibooks',
    'q':       'wikiquote',
    's':       'wikisource',
    'v':       'wikiversity',
    'wmf':     'wikimediafoundation',
    # See supportedLanguages above for the working version
    'm':       'meta.wikimedia',
    'mw':      'mediawiki',
    'commons': 'commons',
}

# Namespace number -> English namespace name (None for the main namespace).
namespaces = {
    -2: u'Media',
    -1: u'Special',
    0: None,
    1: u'Talk',
    2: u'User',
    3: u'User talk',
    4: u'Wikipedia',
    5: u'Wikipedia talk',
    6: u'File',
    7: u'File talk',
    8: u'Mediawiki',  # Capitalization fudged FIXME
    9: u'Mediawiki talk',
    10: u'Template',
    11: u'Template talk',
    12: u'Help',
    13: u'Help talk',
    14: u'Category',
    15: u'Category talk',
    # Custom NS
    100: u'Portal',
    101: u'Portal talk',
    108: u'Book',
    109: u'Book talk',
}


class Family(object):
    """A wiki family, built from a short prefix ('w') or a full name."""

    def __init__(self, name):
        # Known prefixes expand to the full family name; anything else is
        # taken to be a full name already.
        self.name = families.get(name, name)
        self.shortname = name  # FIXME
        self.namespaces = namespaces

    def __repr__(self):
        return '%s.Family(%r)' % (__name__, self.name)

    def dbName(self, code):
        """Return the name of the MySQL database for language *code*."""
        if self.name != 'wikipedia':
            return '%s%s' % (code, self.name)
        else:
            return '%swiki' % (code)
class Site(object):
    """A single wiki, identified by a language code and a Family."""

    def __init__(self, code, fam=None):
        """Create the site for language *code*.

        *fam* may be a family prefix/name (string), a Family instance, or
        empty, in which case the family of the module default MySite is used.
        """
        self.lang = code.lower().encode('ascii')
        if fam:
            if isinstance(fam, (str, type(u''))):
                self.family = Family(fam)
            else:
                self.family = fam
        else:
            self.family = MySite.family
        self.messages = False

    def __repr__(self):
        return '%s.Site(%r, %r)' % (__name__, self.lang, self.family)

    def getUrl(self, path, retry = True, sysop = False, data = None, compress = True):
        """Fetch *path* from this site and return the raw response body.

        *data*, when given, is url-encoded and sent as a POST body.  A gzip
        encoded response is decompressed.  retry, sysop and compress are
        accepted for pywikipedia compatibility and are not used here.
        """
        url = '%s://%s%s' % (self.protocol(), self.hostname(), path)
        # NOTE(review): the format string was emptied when this file was
        # extracted ('' % (...) raises TypeError); an HTML comment is assumed
        # because this prints into the generated page - confirm.
        print('<!-- %s -->' % (url.encode('utf-8'),))
        uo = MyURLopener()
        f = uo.open(url, urllib.urlencode(data or '') or None)
        try:
            # TODO: include magic number check '\x1f\x8b' at bytes 0-1
            if f.info().get('Content-Encoding') in ('gzip', 'x-gzip'):
                import StringIO, gzip
                return gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
            else:
                return f.read()
        finally:
            # The connection used to be left for the garbage collector.
            f.close()

    def protocol(self):
        return 'http'

    def hostname(self):
        return '.'.join((self.lang, self.family.name, 'org'))

    def path(self):
        return '/w/index.php'

    def apipath(self):
        return '/w/api.php'

    def encoding(self):
        return 'utf-8'

    def language(self):
        return self.lang

    def sitename(self):
        return ':'.join([self.family.name, self.lang])

    def isInterwikiLink(self, s):
        """Return True if *s* starts with 'code:' or ':code:' for a supported code."""
        return any(s.startswith('%s:' % iw) or s.startswith(':%s:' % iw)
                   for iw in supportedLanguages)

    def namespace(self, num):
        """Return the name of namespace *num* ('' for the main namespace)."""
        return namespaces[num] or ''

    def category_namespaces(self):
        return (namespaces[14], )

    def page_action_address(self, s, action):
        return '%s?title=%s&action=%s' % (self.path(), s, action)

    def put_address(self, s):
        return '%s?title=%s&action=submit' % (self.path(), s)

    def get_address(self, s):
        return '%s?title=%s&redirect=no' % (self.path(), s)

    def nice_get_address(self, s):
        return '/wiki/%s' % (s)

    def dbName(self):
        """Return MySQL database name."""
        return self.family.dbName(self.lang.replace('-', '_'))
class Page(object):
    """A wiki page: a title on a Site, with lazy download of its wikitext.

    NOTE(review): the HTML literals in get() and put() were stripped from the
    copy of the file this was restored from.  They are reconstructed below
    from the surrounding code and MediaWiki's edit form field names; every
    such literal must be checked against the original source.
    """

    def __init__(self, site, title, insite=None, defaultNamespace=None):
        """Parse *title*, following language/family prefixes to other sites.

        *title* may be a byte string (UTF-8, else Latin-1) or unicode, may be
        percent-encoded and may contain HTML character references.
        *insite* is accepted for pywikipedia compatibility and is not used.
        """
        self._site = site
        self._namespace = None
        try:
            title = unicode(title)
        except UnicodeError:
            try:
                title = unicode(title, 'utf-8')
            except UnicodeError:
                title = unicode(title, 'latin-1')
        # FIXME: remove need to test for ASCII
        if title.encode('ascii', 'ignore') == title:
            try:
                title = unicode(urllib.unquote(title.encode('ascii')), 'utf-8')
            except UnicodeError:
                raise NoPage('Invalid esacped character in [[%s]]!' % title)
        title = html2unicode(title)  # decode html references
        title = title.replace('_', ' ').strip()
        for part in title.split(':')[:-1]:
            prefix = part.lower().strip()
            # FIXME support ":title" in defaultNamespace
            # FIXME support other languages
            # FIXME better reverse lookup
            if prefix == '':
                self._namespace = 0
                # Drop the unstripped part plus its colon; the stripped
                # length is too short when the part contained whitespace.
                title = title[len(part)+1:]
                break
            elif prefix.capitalize() in namespaces.values():
                wanted = prefix.capitalize()
                for number, name in namespaces.items():
                    if name == wanted:
                        self._namespace = number
                        break
                break
            elif prefix in supportedLanguages:  # Changing Languages
                if prefix != self._site.lang:
                    self._site = Site(prefix, self._site.family)
                title = title[len(part)+1:]
            elif prefix in families:  # Changing Families
                if prefix != self._site.family.shortname:
                    self._site = Site(self._site.lang, Family(prefix))
                title = title[len(part)+1:]
            else:
                break
        # Clean up title
        # FIXME better code
        if self._namespace is None:
            self._namespace = defaultNamespace or 0
            if defaultNamespace:
                title = namespaces[defaultNamespace] + ':' + title
        self._title = title.strip()  # is this needed now??? -> .replace('_', ' ').strip('\t :')

    def __repr__(self):
        return '%s.Page(%r, %r)' % (__name__, self._site, self._title)

    def site(self):
        return self._site

    def latestRevision(self):
        return 0

    def canBeEdited(self):
        return True

    def isTalkPage(self):
        # Negative (virtual) namespaces are never talk pages; in Python
        # -1 % 2 == 1, so the sign has to be checked explicitly.
        return self._namespace > 0 and self._namespace % 2 == 1

    def protocol(self):
        return self._site.protocol()

    def namespace(self):
        return self._namespace

    def hostname(self):
        return self._site.hostname()

    def path(self):
        return '/w/index.php'

    def urlname(self):
        """Return the title, underscored, UTF-8 encoded and percent-quoted."""
        return urllib.quote(self.title(underscore=True).encode('utf-8'))

    def exists(self):
        """Return True if the page text could be fetched, False otherwise.

        Any failure, including IsRedirectPage raised by get(), yields False
        (best effort, as before).  An existing but empty page is now True.
        """
        try:
            self.get()
        except Exception:
            return False
        return True

    def getReferences(self, follow_redirects=True, withTemplateInclusion=True, onlyTemplateInclusion=False, redirectsOnly=False):
        """Return a generator over backlinks to this page (at most 500).

        The keyword arguments are accepted for compatibility and ignored.
        """
        import pagegenerators
        gf = pagegenerators.GeneratorFactory()
        gf.limit = 500
        return gf.api({'list': 'backlinks', 'bltitle': self.title(), }, 'bl')

    def linkedPages(self):
        """Return Pages for wikilinks in the text that carry a '#' or '|' part.

        NoPage and IsRedirectPage from get() propagate; SectionError gives [].
        """
        try:
            return [Page(self.site(), m.group(1)) for m in re.finditer(r'\[\[([^][|<>#\n]+)[#|][^][\n]*\]\]', self.get())]
        except NoPage:
            raise
        except IsRedirectPage:
            raise
        except SectionError:
            return []

    def isRedirectPage(self):
        try:
            self.get()
            return False
        except IsRedirectPage:
            return True
        except Exception:
            return False

    def title(self, underscore = False):
        if underscore:
            return self._title.replace(' ', '_')
        else:
            return self._title

    def titleWithoutNamespace(self, underscore=False):
        """Return the title without its leading 'Namespace:' part."""
        if self.namespace() == 0:
            return self.title(underscore=underscore)
        else:
            # [-1] instead of [1]: a title stored without a prefix (namespace
            # learnt from get() or defaultNamespace) must not raise IndexError.
            return self.title(underscore=underscore).split(':', 1)[-1]

    def aslink(self, forceInterwiki=False):
        if forceInterwiki or self.site() != getSite():
            if self.site().family != getSite().family:
                return u'[[%s:%s:%s]]' % (self.site().family.shortname, self.site().lang, self.title())
            else:
                return u'[[%s:%s]]' % (self.site().lang, self.title())
        else:
            return '[[%s]]' % self.title()

    def get(self, force=False, get_redirect=False, throttle=True, sysop=False, nofollow_redirects=False, change_edit_time=True):
        """Return the page's wikitext, downloading the edit form on first use.

        Raises NoPage for empty/illegal titles or when the edit form cannot
        be parsed, and IsRedirectPage (with the target as argument) for a
        redirect unless get_redirect or nofollow_redirects is set.
        force, throttle, sysop and change_edit_time are not used.
        """
        # FIXME add redirect following
        # all NoPage exceptions are contained here for convenience
        for illegalChar in u'#<>[]|{}\n\ufffd':
            if illegalChar in self.title():
                raise NoPage('Illegal character in %s!' % self.aslink())
        if not self._title:
            raise NoPage("No title")
        if not hasattr(self, '_contents'):  # or not self._contents or force:
            text = self._site.getUrl("%s?title=%s&action=%s" % (self.path(), self.urlname(), 'edit'))
            charset = 'utf-8'
            self._html = text
            # NOTE(review): both markers below were stripped from SOURCE.
            # '</textarea>' follows from the error message; the second marker
            # (a tab only shown for existing pages) is a guess - confirm.
            if '</textarea>' not in text:
                raise NoPage("No textarea found")
            elif '<li id="ca-history"' not in text:
                raise NoPage(self.site(), self.aslink(forceInterwiki = True))
            else:
                # get values for put()
                # A timestamp is 14 digits wrapped in double quotes directly
                # before the name attribute.
                etBegin = text.find('" name="wpEdittime"')
                if etBegin > 0 and text[etBegin-15] == '"':
                    self.wpEdittime = text[etBegin-14:etBegin]
                else:
                    self.wpEdittime = ''
                # FIXME implement namespace support correctly
                # Hack to get namespace after downloading
                nsBegin = text.find('wgNamespaceNumber=')
                if nsBegin > 0:
                    self._namespace = int(text[nsBegin+len('wgNamespaceNumber='):text.index(',', nsBegin)])
                # get revision ID
                rvBegin = text.find('wgCurRevisionId=')
                if rvBegin > 0:
                    self.revisionid = int(text[rvBegin+len('wgCurRevisionId='):text.index(',', rvBegin+22)])
                else:
                    self.revisionid = None
                # Used to check against deletion timestamps
                stBegin = text.find('" name="wpStarttime"')
                if stBegin > 0 and text[stBegin-15] == '"':
                    self.wpStarttime = text[stBegin-14:stBegin]
                else:
                    self.wpStarttime = self.wpEdittime or time.strftime('%Y%m%d%H%M%S')
                # NOTE(review): the two literals here were stripped together
                # with the iEnd assignment between them - confirm.
                iBegin = text.index('>', text.index('<textarea')) + 1
                iEnd = text.rindex('</textarea>')
                text = unescape(text[iBegin:iEnd])
                self._contents = unicode(text, charset, errors='replace')
                try:  # KEEP MAINTENANCE UP TO DATE
                    import maintainer
                    maintainer.update(self, self._contents)
                except ImportError:
                    pass
        if '#REDIRECT' == self._contents.upper()[0:9]:
            i1 = self._contents.find('[[') + 2
            i2 = self._contents.find(']]', i1)
            self._redirarg = redirecttarget = self._contents[i1:i2]
            if get_redirect:
                pass
            elif not nofollow_redirects:
                raise IsRedirectPage(self._redirarg)
        # Get the protection status
        if 'mw-protectedpagetext' in self._html:
            self.protection = "sysop"
        elif 'mw-semiprotectedpagetext' in self._html or 'semiprotectedpagewarning' in self._html:
            # They can't seem to make up their mind on what to call it
            self.protection = "autoconfirmed"
        else:
            self.protection = None
        return self._contents

    def put_async(self, newtext, comment=None, watchArticle=None, minorEdit=True, force=False, callback=None):
        """Call put() synchronously, then *callback* if it is callable."""
        self.put(newtext, comment=comment, minorEdit=minorEdit)
        if callable(callback):
            callback()

    def put(self, newtext, comment=None, watchArticle=None, minorEdit=False, force=False):
        """Print an HTML edit form, pre-filled with *newtext*, for the user to submit.

        Nothing is saved by this method.  When *comment* is None the summary
        is built from the script name and the message set with setAction().

        NOTE(review): all markup in this method is reconstructed (see the
        class docstring); only the Python expressions are from SOURCE.
        """
        if not hasattr(self, '_html'):
            self.get(force=True)
        if comment is None:
            # Append using filename to edit summaries
            comment = ('[[tools:~dispenser/view/Pywikipedia|%s]]: %s' % (sys.argv[0][2:-3], EditMsg.strip(), ))
        print('<form id="editform" name="editform" method="post" action="%s://%s%s?title=%s&amp;action=%s" enctype="multipart/form-data">' % (self.protocol(), self.hostname(), self.path(), self.urlname(), 'submit'))
        if self.protection == "sysop":
            print('<p class="mw-protectedpagetext"><strong>This page is currently protected, and can be edited only by administrators.</strong>\n</p>')
        elif self.protection == "autoconfirmed":
            print('<p class="mw-semiprotectedpagetext">Warning: This page is currently semi-protected, and can be edited only by established registered users.</p>')
        print('<input type="hidden" value="" name="wpSection" />')
        print('<input type="hidden" value="" name="wpAntispam" />')
        print('<input type="hidden" value="%s" name="wpStarttime" />' % self.wpStarttime)
        print('<input type="hidden" value="%s" name="wpEdittime" />' % self.wpEdittime)
        print('<input type="hidden" value="" name="wpScrolltop" id="wpScrolltop" />')
        # IE8 XSS filter corrupts sent text
        print("""<!--[if IE 8]><p class="error">Internet Explorer 8 may corrupt the submitted text; <a href="%s://%s%s?title=%s&amp;action=%s">edit %s</a> directly instead.</p><![endif]-->""" % (self.protocol(), self.hostname(), self.path(), self.urlname(), 'edit', escape(self.title()).encode('utf-8'), ))
        print('<textarea name="wpTextbox1" id="wpTextbox1" cols="80" rows="25">')
        print(escape(newtext).encode('utf-8'))
        print('</textarea>')
        print('<input type="text" value="%s" name="wpSummary" id="wpSummary" maxlength="200" size="60" />' % escape(comment).encode('utf-8'))
        print('<input type="hidden" value="1" name="wpAutoSummary" />')  # blank summary check
        print('<input type="checkbox" value="1" name="wpMinoredit" id="wpMinoredit"%s /> <label for="wpMinoredit">This is a minor edit</label>' % ((minorEdit or 'minordefault=true' in os.getenv("HTTP_COOKIE", '')) and ' checked="checked"' or ''))
        print('<input type="checkbox" value="1" name="wpWatchthis" id="wpWatchthis"%s /> <label for="wpWatchthis">Watch this page</label>' % ('watchdefault=false' not in os.getenv("HTTP_COOKIE", '') and ' checked="checked"' or ''))
        print('<div class="editButtons">')
        print('<input type="submit" value="Save page" name="wpSave" id="wpSave" /> ')
        print('<input type="submit" value="Show preview" name="wpPreview" id="wpPreview" /> ')
        print('<input type="submit" value="Show changes" name="wpDiff" id="wpDiff" /> ')
        print('</div>')
        print('<input type="hidden" value="+\\" name="wpEditToken" />')
        print('</form>')
        try:  # Log
            import maintainer
            maintainer.processPage(self)
        except ImportError:
            pass
    # ---- HTML escaping, wikitext and unicode helpers ----
    # NOTE(review): the entity and tag literals in this section had been
    # decoded or stripped in the copy this file was restored from (leaving
    # no-op replaces such as "<" -> "<" and empty regexes).  They are restored
    # from the matching pywikipedia functions; 'gallery' and '&nbsp;' are
    # best guesses - confirm.

def unescape(s):
    """Replace the five predefined XML entities in *s* by their characters."""
    if '&' not in s:
        return s
    s = s.replace("&lt;", "<")
    s = s.replace("&gt;", ">")
    s = s.replace("&apos;", "'")
    s = s.replace("&quot;", '"')
    s = s.replace("&amp;", "&")  # Must be last
    return s


def escape(s):
    """Escape &, <, > and the double quote in *s* for use in HTML."""
    s = s.replace("&", "&amp;")  # Must be first
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace('"', "&quot;")
    return s


def removeDisabledParts(text, tags=['*']):
    """Return *text* without the parts where wiki markup is disabled.

    *tags* lists the kinds of parts to remove: 'comments', 'includeonly',
    'nowiki', 'pre', 'source', 'gallery', 'math', or '*' for all of them.
    """
    regexes = {
        'comments':    r'<!--.*?-->',
        'includeonly': r'<includeonly>.*?</includeonly>',
        'nowiki':      r'<nowiki>.*?</nowiki>',
        'pre':         r'<pre>.*?</pre>',
        'source':      r'<source .*?</source>',
        'gallery':     r'<gallery.*?</gallery>',
        'math':        r'<math>.*?</math>',
    }
    if '*' in tags:
        tags = regexes.keys()
    return re.compile('|'.join([regexes[tag] for tag in tags]), re.IGNORECASE | re.DOTALL).sub('', text)


def setAction(s):
    """Set the default edit summary used by Page.put()."""
    global EditMsg
    EditMsg = s


def showDiff(old, new):
    """Print an HTML table showing the line differences between *old* and *new*."""
    import difflib
    table = difflib.HtmlDiff().make_table(
        old.split('\n'), new.split('\n'),
        fromdesc="Current revision", todesc="Your text",
        context=True, numlines=1)
    print(table.replace('&nbsp;', ' ').replace('nowrap="nowrap"', 'class="diff_text"').encode('utf-8'))


def sectionencode(text, encoding='utf-8'):
    """Encode text so that it can be used as a section title in wiki-links."""
    return urllib.quote(text.replace(" ", "_").encode(encoding), safe=':').replace("%", ".")
    # Note there is a bug in pywikipedia, it includes '/' as a safe character


######### Unicode library ###########
def UnicodeToAsciiHtml(s):
    """Encode *s* as ASCII, using numeric character references where needed."""
    return s.encode('ascii', 'xmlcharrefreplace')


def url2unicode(title, site, site2 = None):
    return urllib.unquote(title)


def unicode2html(x, encoding):
    """Return *x* unchanged if it fits *encoding*, else with character references."""
    try:
        x.encode(encoding)
    except UnicodeError:
        x = UnicodeToAsciiHtml(x)
    return x


def html2unicode(t, ignore = []):
    """Decode numeric and named HTML character references in *t*.

    Code points listed in *ignore* are left as named entities; when *ignore*
    is empty the XML entities are decoded with unescape() as well.
    """
    t = unicode(t)
    start = t.find('&#', 0)
    while start != -1:
        end = t.find(';', start)
        if end == -1:
            # No terminator anywhere after this point.  Slicing with -1 used
            # to decode garbage and splice the whole string in again.
            break
        if t[start+2] in ('X', 'x'):
            digits, base, skip = t[start+3:end], 16, 3
        else:
            digits, base, skip = t[start+2:end], 10, 2
        try:
            t = t[:start] + unichr(int(digits, base)) + t[end+1:]
        except (ValueError, OverflowError):
            start += skip
        start = t.find('&#', start)
    #FIXME DoubleEscapeProblems &amp;lt;
    if not ignore:
        t = unescape(t)
    # Decode xhtml elements
    if '&' in t and ';' in t:
        import htmlentitydefs
        for (name, codepoint) in htmlentitydefs.name2codepoint.items():
            if codepoint not in ignore:
                t = t.replace('&%s;' % name, unichr(codepoint))
    return t


def translate(code, dictText):
    """Return the entry of *dictText* for *code*, falling back to 'en'."""
    # If a site is given instead of a code, use its language
    if hasattr(code, 'lang'):
        code = code.lang
    return dictText.get(code, dictText.get('en'))


def get_throttle():
    return


#TODO sort me
def getCategoryLinks(text, site):
    return []
# The category / language-link functions below are no-op placeholders that
# keep the pywikipedia signatures; they return their input (or an empty
# container) unchanged.
def removeCategoryLinks(text, site, marker = ''):
    return text


def replaceCategoryInPlace(oldtext, oldcat, newcat, site=None):
    return oldtext


def replaceCategoryLinks(oldtext, new, site=None):
    return oldtext


def getLanguageLinks(text, insite = None, pageLink = "[[]]"):
    return {}


def removeLanguageLinks(text, site = None, marker = ''):
    return text


def replaceLanguageLinks(oldtext, new, site = None):
    return oldtext


def handleArgs():
    """Return the CGI query string as pywikipedia-style command line arguments.

    Each non-empty 'name=value' item becomes '-name:value' after
    percent-decoding; only the first '=' of an item is replaced.
    """
    return ['-' + urllib.unquote(s).replace('=', ':', 1)
            for s in os.getenv("QUERY_STRING", '').split("&") if s]


def getSite(code = None, fam = None, user=None, persistent_http=None):
    """Return the module default site; all arguments are ignored."""
    return MySite


def stopme():
    pass


def showHelp(moduleName = None):
    """Print the docstring of *moduleName* (default: the running script) as HTML.

    Keys of the module's optional ``docuReplacements`` dict are replaced by
    their values in the text first.
    """
    moduleName = moduleName or sys.argv[0][2:sys.argv[0].rindex('.')]
    # __import__ instead of exec(): the name is derived from argv and must
    # never be executed as code.
    module = __import__(moduleName)
    helpText = module.__doc__ or ''  # a module without docstring has None
    if hasattr(module, 'docuReplacements'):
        for key, value in module.docuReplacements.items():
            helpText = helpText.replace(key, value.strip('\n\r'))
    # NOTE(review): the tags of both literals below were stripped from the
    # copy this file was restored from; <p> and <pre> are assumed - confirm.
    print('<p>Information or features may not work in HTML mode.</p>')
    print('<pre>%s</pre>' % escape(helpText.strip()))
    # ---- Console replacements, exceptions and page scaffolding ----
    # NOTE(review): every HTML literal in this section was stripped from the
    # copy this file was restored from.  The markup below is reconstructed
    # from the format arguments and must be checked against the original;
    # literals whose content could not be inferred are kept empty and marked.

def inputChoice(question, answers, hotkeys, default = None):
    """Print the question and return the lower-cased *default* (no interaction)."""
    output('%s [%s] %s' % (question, ', '.join(answers), default))
    return default.lower()


def input(text):
    """Print *text* and return '' (there is no interactive input on the web)."""
    output(text)
    return ''


def output(s='', decoder = None, newline = True, toStdout = False):
    """Print *s* as HTML: escaped, with colour codes and [[links]] converted.

    decoder, newline and toStdout are accepted for compatibility only.
    """
    html = escape('%s\n' % s).replace('\n', "<br/>\n")
    # pywikipedia colour markers: \03{lightred}text\03{default}
    html = re.sub(r'\03\{(light|)([^{}]*)\}(.*?)\03\{default\}', r'<span style="color:\2;">\3</span>', html)
    html = re.sub(r'\[\[([^[\]{|}\n]*?)\]\]', r'<a href="http://%s/wiki/\1">[[\1]]</a>' % MySite.hostname(), html)
    print(html.encode('utf-8'))


# Local exceptions
class Error(Exception):
    """Wikipedia error"""

class NoPage(Error):
    """Page does not exist"""

class EditConflict(Error):
    """There has been an edit conflict while uploading the page"""

class NoSuchSite(Error):
    """Site does not exist"""

class IsRedirectPage(Error):
    """Page is a redirect page"""

class ServerError(Error):
    """Got unexpected server response"""

class BadTitle(Error):
    """Server responded with BadTitle."""

class LockedPage(Error):
    """Page is locked"""

class SectionError(Error):
    """The section specified by # does not exist"""


# Web pywikipedia specific APIs
def textbox(name, value, label = None, attrib=''):
    """Print a labelled text input; *attrib* is inserted verbatim into the tag."""
    print('<label for="%s">%s</label><input type="text" name="%s" value="%s" id="%s" %s/>' % (name, label or '%s: ' % name.capitalize(), name, value, name, attrib))


def checkbox(name, checked, label = None, attr=''):
    """Print a labelled checkbox; *attr* is inserted verbatim into the tag."""
    # "or ''": a false value used to be formatted into the tag as 'False'.
    print('<input type="checkbox" name="%s" id="%s"%s %s/><label for="%s">%s</label>' % (name, name, checked and ' checked="checked"' or '', attr, name, label or name.capitalize()))


def startContent(title=None, form = False, notice="", page = None, submitLabel = '↵', head=None):
    """
    More or less a work in progress
    title  - Application title
    page   - Processed page's title
    notice - Application notice, displays above the title
    form   - bool display input form
    submitLabel - label for submit button on form
                  defaults to carriage return symbol
    head   - extra markup substituted for $2 in ./text/head.html
    """
    if not page:
        page = MyPage
    if not title:
        title = sys.argv[0]
        title = title[title.rfind('/')+1:].capitalize()
        if page.title():
            title += ' - %s' % page.title()
    print('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">')
    print('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" dir="ltr">')
    with open('./text/head.html') as template:
        print(template.read().replace('$1', title.encode('utf-8')).replace('$2', head or ''))
    print("""<body>""")
    print('<h1>%s</h1>' % title.encode('utf-8'))
    print("""\
<div id="siteNotice">""")
    if notice:
        print(notice.encode('utf-8'))
    else:
        try:
            with open('./text/notice.html', 'r') as noticeFile:
                print(noticeFile.read())
        except IOError:
            pass
        try:
            with open('/var/www/sitenotice', 'r') as noticeFile:
                notice = noticeFile.read()
            if notice:
                print('<div class="sitenotice">%s</div>' % notice)
        except IOError:
            pass
    print('</div>')
    print('<div id="bodyContent">')
    if page.title():  # Make some bookmarklets work
        site = page.site()
        # NOTE(review): variable names are inferred from the eleven values
        # passed, following MediaWiki's global variable order - confirm.
        print("""<script type="text/javascript">
var wgScript = "%s",
    wgServer = "%s",
    wgCanonicalNamespace = "%s",
    wgNamespaceNumber = %s,
    wgPageName = "%s",
    wgTitle = "%s",
    wgUserName = %s,
    wgUserLanguage = "%s",
    wgContentLanguage = "%s",
    wgCurRevisionId = %s,
    wgDBname = "%s";
</script>""" % (site.path(), site.protocol()+"://"+site.hostname(), site.namespace(page.namespace()).encode('utf-8'), page.namespace(), page.title(underscore=True).encode('utf-8'), page.titleWithoutNamespace().encode('utf-8'), 'null', site.language(), site.language(), 'null', site.dbName()))
    if any(prgm in sys.argv[0] for prgm in ('reflink', 'dablink', 'checklink')):
        # NOTE(review): this literal is lost entirely; it presumably used
        # %(tool)s.  Kept as found in SOURCE (prints one blank) - restore.
        print(''' ''' % {
            'tool': 'eflink' in sys.argv[0] and 'Reflinks' or 'ablinks' in sys.argv[0] and 'Dablinks' or 'Checklinks',
        })
    else:
        print('')  # NOTE(review): literal lost in SOURCE
    print('')      # NOTE(review): literal lost in SOURCE
    if form:
        print('<form action="" method="get">')
        print('%s -page:' % sys.argv[0])
        print('<input type="text" name="page" value="%s" size="40" />' % escape(page.aslink()[2:-2].encode('utf-8')))
        print('<input type="submit" value="%s" /></form>' % submitLabel)
    # ---- Page footer ----
    # NOTE(review): the HTML literals in endContent() were stripped from the
    # copy this file was restored from and are reconstructed - confirm.

def endContent(replag=None, *notes):
    """Print the navigation and footer templates and close the document.

    replag - database replication lag in seconds; reported in minutes when
             it rounds to a non-zero value
    notes  - extra footer lines, placed before the timing line
    """
    replag = int((replag or 0) / 60)
    print("""</div>""")
    print("<!-- CPU time: %s -->" % time.clock())
    with open('./text/nav.html') as nav:
        print(nav.read())
    timing = 'Page generated in %#4.2f seconds, CPU time: %#3.2f seconds%s' % (
        time.time()-startTime, time.clock(),
        replag and ", database replication lag is %d minutes" % replag or '')
    with open('./text/footer.html') as footer:
        print(footer.read().replace("$1", '<br />'.join(notes + (timing,))))
    print('</body>')
    print('</html>')
    # ---- CGI header handling and module defaults ----
    # NOTE(review): the markup of the 403 and 301 bodies was stripped from
    # the copy this file was restored from; only the visible wording is from
    # SOURCE, the tags are reconstructed - confirm.

def handleUrlAndHeader(connicalize=True, allowBots=False):
    """
    Redirects the URL to a better format and prints an HTTP header

    connicalize - rewrite %20 and + in the request URI to underscores
    allowBots   - skip the crawler check for URLs with a query string

    Returns True when the script should go on to print its page, False when
    a redirect or an error response has been sent.  Exits the process after
    a 403 response to a crawler.
    """
    if connicalize:
        redirect = os.getenv("REQUEST_URI", '').replace('%20', '_').replace('+', '_').rstrip('_')
    else:
        redirect = os.getenv("REQUEST_URI", '')
    # prog.py/pagename -> prog.py?page=pagename
    if os.getenv("PATH_INFO", ''):
        redirect = os.getenv("SCRIPT_NAME", '') + "?page=" + os.getenv("PATH_INFO", '/').replace('&', '%26').replace('+', '%2B')[1:]

    if not allowBots and '?' in redirect and any(s in os.getenv("HTTP_USER_AGENT", '').lower() for s in ('crawler', 'http://', 'https://', 'robot', 'spider', )):
        # Prevent Google, Yahoo, MSN, and other bots from indexing dynamically generated pages
        print('Status: 403')
        print('Content-Type: text/html; charset=utf-8')
        print('')
        # The User-Agent is client controlled: escape it before echoing.
        print("""<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>Crawling dynamicly generated pages is not allowed</p>
<hr />
<address>Requesting agent: %s</address>
</body></html>""" % escape(os.getenv("HTTP_USER_AGENT", '')))
        sys.exit()
    elif redirect != os.getenv("REQUEST_URI", ''):
        # http://turbo-technical-report.blogspot.com/2006/11/server-side-301-302-http-response.html
        # Saved me quite a bit of trouble.  Thanks!
        print("Status: 301")
        print('Location: ' + redirect.replace('%7E', '~'))
        print('Content-Type: text/html; charset=utf-8')
        print('')
        print("""<html><head>
<title>301 Moved Permanently</title>
</head><body>
<h1>Moved Permanently</h1>
<p>The document has moved <a href="%s">here</a>.</p>
</body></html>""" % escape(redirect))
        return False
    elif os.getenv("REQUEST_METHOD", "GET") not in ('GET', 'POST'):
        print('Status: 405 Method not allowed')
        print('')
        print("Method is not allowed, only GET and POST methods are accepted")
        return False
    else:
        #print 'Content-Type: application/xhtml+xml; charset=utf-8'
        print('Content-Type: text/html; charset=utf-8')
        print('')
        return True


def _queryArgument(prefix, default):
    """Return the value of the last '-name:value' query argument, or *default*."""
    values = [arg[len(prefix):] for arg in handleArgs() if arg.startswith(prefix)]
    return (values and values[-1]) or default


# Setup the default values
# The family fallback used to be the literal '-family:w', which produced a
# family (and host name) called '-family:w' for an empty "family=" argument.
MySite = Site(_queryArgument('-lang:', 'en'), Family(_queryArgument('-family:', 'w')))
MyPage = Page(MySite, _queryArgument('-page:', ''))
#Disable since it breaks tools which assume English default
#MySite = MyPage.site()
setAction('Web wikipedia python library')


class MyURLopener(urllib.FancyURLopener):
    # Sent as the User-Agent of every request made through Site.getUrl().
    version = "WebPythonWikipedia/1.0"
# functions to manipulate wikitext strings (by default, all text arguments
# should be Unicode)
# All return the modified text as a unicode object

def replaceExcept(text, old, new, exceptions, caseInsensitive=False, allowoverlap=False, marker = '', site = None):
    """
    Return text with 'old' replaced by 'new', ignoring specified types of text.

    Skips occurrences of 'old' within exceptions; e.g., within nowiki tags or
    HTML comments. If caseInsensitive is true, then use case insensitive
    regex matching. If allowoverlap is true, overlapping occurrences are all
    replaced (watch out when using this, it might lead to infinite loops!).

    Parameters:
        text            - a unicode string
        old             - a compiled regular expression, or a pattern string
        new             - a unicode string (which can contain regular
                          expression references), or a function which takes
                          a match object as parameter. See parameter repl of
                          re.sub().
        exceptions      - a list of strings which signal what to leave out,
                          e.g. ['math', 'table', 'template'], and/or
                          compiled regular expressions
        caseInsensitive - a boolean
        marker          - a string that will be added to the last replacement;
                          if nothing is changed, it is added at the end
        site            - not used

    Raises ValueError for an exception name that is not known.

    NOTE(review): the tag literals of exceptionRegexes and the named groups
    of 'hyperlink' and 'link' had been stripped from the copy this file was
    restored from; they are restored from pywikipedia's replaceExcept. The
    group names 'url' and 'title' are assumptions - confirm.
    """
    exceptionRegexes = {
        'comment':     re.compile(r'(?s)<!--.*?-->'),
        'includeonly': re.compile(r'(?is)<includeonly>.*?</includeonly>'),
        'math':        re.compile(r'(?is)<math>.*?</math>'),
        'noinclude':   re.compile(r'(?is)<noinclude>.*?</noinclude>'),
        # wiki tags are ignored inside nowiki tags.
        'nowiki':      re.compile(r'(?is)<nowiki>.*?</nowiki>'),
        # preformatted text
        'pre':         re.compile(r'(?ism)<pre>.*?</pre>'),
        'source':      re.compile(r'(?is)<source .*?</source>'),
        # inline references
        'ref':         re.compile(r'(?ism)<ref[ >].*?</ref>'),
        'timeline':    re.compile(r'(?is)<timeline>.*?</timeline>'),
        # lines that start with a space are shown in a monospace font and
        # have whitespace preserved.
        'startspace':  re.compile(r'(?m)^ (.*?)$'),
        # tables often have whitespace that is used to improve wiki
        # source code readability.
        # TODO: handle nested tables.
        'table':       re.compile(r'(?ims)^{\|.*?^\|}|<table>.*?</table>'),
        # templates with parameters often have whitespace that is used to
        # improve wiki source code readability.
        # 'template':    re.compile(r'(?s){{.*?}}'),
        # The regex above fails on nested templates. This regex can handle
        # templates cascaded up to level 3, but no deeper. For arbitrary
        # depth, we'd need recursion which can't be done in Python's re.
        # After all, the language of correct parenthesis words is not regular.
        'template':    re.compile(r'(?s){{(({{(({{.*?}})|.)*}})|.)*}}'),
        'hyperlink':   re.compile(r'(?P<url>http[s]?://[^\]\s<>]*?[^\]\s\)\.:;,<>"](?=[\]\s\)\.:;,<>"]*\'\')|http[s]?://[^\]\s<>"]*[^\]\s\)\.:;,<>"])'),
        'gallery':     re.compile(r'(?is)<gallery.*?>.*?</gallery>'),
        # this matches internal wikilinks, but also interwiki, categories, and
        # images.
        'link':        re.compile(r'\[\[(?P<title>[^\]\|]*)(\|[^\]]*)?\]\]')
    }
    stringTypes = (str, type(u''))

    # if we got a string, compile it as a regular expression
    if isinstance(old, stringTypes):
        if caseInsensitive:
            old = re.compile(old, re.IGNORECASE | re.UNICODE)
        else:
            old = re.compile(old)

    dontTouchRegexes = []
    for exc in exceptions:
        if isinstance(exc, stringTypes):
            # assume it's a reference to the exceptionRegexes dictionary
            # defined above.
            if exc not in exceptionRegexes:
                raise ValueError("Unknown tag type: " + exc)
            dontTouchRegexes.append(exceptionRegexes[exc])
        else:
            # assume it's a regular expression
            dontTouchRegexes.append(exc)

    if not callable(new):
        # it is a little hack to make \n work. It would be better to fix it
        # previously, but better than nothing.
        new = new.replace('\\n', '\n')
    groupR = re.compile(r'\\(?P<number>\d+)|\\g<(?P<name>.+?)>')

    index = 0
    markerpos = len(text)
    while index < len(text):
        match = old.search(text, index)
        if not match:
            # nothing left to replace
            break

        # check which exception will occur next.
        nextExceptionMatch = None
        for dontTouchR in dontTouchRegexes:
            excMatch = dontTouchR.search(text, index)
            if excMatch and (
                    nextExceptionMatch is None or
                    excMatch.start() < nextExceptionMatch.start()):
                nextExceptionMatch = excMatch

        if nextExceptionMatch is not None and nextExceptionMatch.start() <= match.start():
            # an HTML comment or text in nowiki tags stands before the next
            # valid match. Skip.
            index = nextExceptionMatch.end()
        else:
            # We found a valid match. Replace it.
            if callable(new):
                # the parameter new can be a function which takes the match
                # as a parameter.
                replacement = new(match)
            else:
                # We cannot just insert the new string, as it may contain regex
                # group references such as \2 or \g<name>.
                # On the other hand, old.sub() on the matched slice does not
                # work because it can't handle lookahead or lookbehind
                # (see bug #1731008).  So the references are expanded here,
                # in a single pass so that text coming from a group is never
                # expanded again.
                def expandGroup(ref, match=match):
                    name = ref.group('name')
                    if name is None:
                        groupID = int(ref.group('number'))
                    elif name.isdigit():
                        groupID = int(name)  # \g<2>
                    else:
                        groupID = name
                    # a group that did not take part in the match is None
                    return match.group(groupID) or ''
                replacement = groupR.sub(expandGroup, new)
            text = text[:match.start()] + replacement + text[match.end():]

            # continue the search on the remaining text
            if allowoverlap:
                index = match.start() + 1
            else:
                index = match.start() + len(replacement)
            if not match.group():
                # The regex matched nothing: step one character, otherwise
                # the same position would be matched forever.
                index += 1
            markerpos = match.start() + len(replacement)
    text = text[:markerpos] + marker + text[markerpos:]
    return text


def isDisabled(text, index, tags = ['*']):
    """
    Return True if text[index] is disabled, e.g. by a comment or by nowiki tags.

    For the tags parameter, see removeDisabledParts().
    """
    # Find a marker that is not already in the text.
    marker = '@@'
    while marker in text:
        marker += '@'
    # If the marker disappears with the disabled parts, it sat inside one.
    text = text[:index] + marker + text[index:]
    text = removeDisabledParts(text, tags)
    return (marker not in text)