# -*- coding: utf-8 -*-
"""
wikiparser.py
MediaWiki text to HTML processor
Dispenser 2008

NOTE(review): the copy of this module that was reviewed had been run through
an HTML stripper: every ``<tag>`` inside string literals and regular
expressions was missing and several statements were lost entirely.  Markup
and patterns flagged ``NOTE(review)`` below are reconstructions inferred
from the surviving code and comments -- confirm them against a pristine copy
before relying on exact output.
"""
import re

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3

try:
    import cgitb
except ImportError:  # cgitb is not available on every interpreter
    cgitb = None
else:
    cgitb.enable()

autonumber = 0
refindex = 1
safe_map = {}
monthname = {
    '01': 'January',
    '02': 'February',
    '03': 'March',
    '04': 'April',
    '05': 'May',
    '06': 'June',
    '07': 'July',
    '08': 'August',
    '09': 'September',
    '10': 'October',
    '11': 'November',
    '12': 'December',
}

# NOTE(review): the original tag whitelist was lost; this is a conservative
# set that includes every element this module itself generates.
_ALLOWED_TAGS = frozenset((
    'a abbr b big blockquote br caption center cite code dd del div dl dt '
    'em font h1 h2 h3 h4 h5 h6 hr i ins li ol p pre s small span strike '
    'strong sub sup table tbody td tfoot th thead tr tt u ul var'
).split())

# wiki list marker -> (list element, item element)
_LIST_MARKERS = {
    '*': ('ul', 'li'),
    '#': ('ol', 'li'),
    ';': ('dl', 'dt'),
    ':': ('dl', 'dd'),
}

# Characters that carry wiki/HTML meaning and must be neutralised by nowiki().
_NOWIKI_MAP = dict((c, '&#%d;' % ord(c)) for c in '{}[]<>\':;*#')

# NOTE(review): group names <quote>/<name> were stripped; restored from the
# ``named.group('name')`` / ``(?P=quote)`` uses that survived.
_REF_NAME_RE = re.compile(
    r' name\s*=\s*(?P<quote>[\'"]?)(?P<name>.+)(?P=quote)')


def parser(text, hostname=None, allowComments=False, allowHtml=True,
           docroot=None, sanitize=True):
    """Convert MediaWiki markup in *text* to an HTML fragment.

    text          -- wikitext to convert.
    hostname      -- wiki host used to build ``docroot`` when none is given
                     (defaults to en.wikipedia.org).
    allowComments -- keep ``<!-- -->`` comments instead of removing them.
    allowHtml     -- when false, every ``<`` and ``>`` is entity-escaped.
    docroot       -- URL prefix for internal links.
    sanitize      -- escape HTML elements that are not whitelisted.

    Returns the converted text.  Raises ValueError if table delimiters
    cannot be paired and RuntimeError if a placeholder inserted by
    hideText() could not be restored.
    """
    if not docroot:
        docroot = "http://%s/wiki/" % (hostname or "en.wikipedia.org")

    # Nowiki
    if '<nowiki>' in text:
        text = re.sub(r'(?is)<nowiki>(.*?)</nowiki>',
                      lambda m: nowiki(m.group(1)), text)

    # <ref>
    if '<ref' in text:
        text = _render_refs(text)

    # Disable HTML
    if not allowHtml:
        text = text.replace('<', '&lt;').replace('>', '&gt;')
    else:
        # <pre> blocks
        if '<pre>' in text:
            text = re.sub(
                r'(?is)<pre>(.*?)</pre>',
                lambda m: "<pre>%s</pre>" % nowiki(m.group(1).strip()),
                text)
        # sanitize remaining HTML
        if sanitize:
            text = _sanitize_html(text)

    # NOTE(review): only ``r'', text.expandtabs()).strip()`` survived of this
    # statement; comment removal is inferred from the otherwise unused
    # ``allowComments`` parameter.
    if not allowComments:
        text = re.sub(r'(?s)<!--.*?-->', r'', text)
    text = text.expandtabs().strip()

    # NOTE(review): the hideText() call site was lost; showText() at the end
    # of this function implies one existed.
    text = hideText(text)

    # #REDIRECT rendering
    # NOTE(review): the original emitted an image here; the '#' is written as
    # a character reference so the line is not mistaken for a numbered list.
    if text.startswith('#REDIRECT'):
        text = re.sub(r'^#REDIRECT *(.*)', r'&#35;REDIRECT \1', text)

    # <math> (already entity-escaped by this point)
    if '&lt;math' in text:
        # NOTE(review): any markup around "Math tag" was lost.
        text = re.sub(r'(?is)&lt;math&gt;(.*?)&lt;/math&gt;',
                      r'Math tag', text)

    # Template (Fake)
    if '{{' in text:
        # NOTE(review): any markup around these replacements was lost.
        text = text.replace(r'{{FA-star}}', r'Featured article')
        text = re.sub(r'(?s)\{\{\s*([^][{|}]+?)\s*(|\|[^{}]*)\}\}',
                      r'{{[[Template:\1|\1]]}}', text)

    # Tables
    if '\n|}' in text:
        text = _render_tables(text)
        # remove whitespace left by empty attribute lists ("<tr >")
        text = text.replace(' >', '>')
        # NOTE(review): two "Optimize <tr>s" replacements were unrecoverable;
        # dropping an empty trailing row is the one inferred here.
        text = text.replace('<tr>\n</table>', '</table>')

    # Headers
    if '\n=' in text or text.startswith('='):
        text = _render_headers(text)

    # Lists, definition lists and space-indented <pre> blocks
    text = _render_blocks(text)

    # Horizontal line
    text = re.sub(r"(?m)^----+ *", '<hr />\n', text)

    # New paragraphs
    text = re.sub(r"(?<=\n\n)(([^<>\n][^\n]*\n)+)", r'<p>\1</p>', text)
    text = text.replace('\n</p>', '</p>\n')

    # Bold and italics
    if '\'' in text:
        text = re.sub(r"(?m)'''(.*?'*)'''", r'<b>\1</b>', text)
        text = re.sub(r"(?m)''(.*?'*)''", r'<i>\1</i>', text)

    # Encode & -> &amp; while keeping existing entities (as numeric refs)
    if '&' in text:
        text = _encode_entities(text)

    # Internal links
    if '[[' in text:
        text = _render_internal_links(text, docroot)

    # External links
    if '://' in text:
        # NOTE(review): the look-behind of the bare-URL alternative was lost;
        # this one keeps URLs inside attributes from being linked again.
        text = re.sub(
            r'\[(\w+://[^][<>"\s]+) *((?<= ).*?]*)?\]'
            r'|(?<![\w"=\'/])(\w+://[^][<>"\s]+)(?=[^<\n>]*<)',
            _create_link, text)

    # TODO(review): [[Image:...]] rendering was disabled (kept only inside a
    # string literal) in the original and is not reproduced here.

    # Hack: put back everything hideText() set aside
    text = showText(text)
    return text


def _render_refs(text):
    """Replace <ref> footnotes with numbered markers and fill <references/>."""
    # NOTE(review): all markup in this helper is reconstructed from the
    # surviving format arguments; footnotes are numbered per call.
    numbers = {}
    notes = []
    index = refindex
    for match in re.finditer(
            r'(?is)<ref\b(?P<params>[^/]*?)>(?P<content>.*?)</ref>', text):
        named = _REF_NAME_RE.search(match.group('params'))
        if named:
            name = named.group('name')
            numbers[name] = index
        else:
            name = index
        anchor = escapeId('%s' % name)
        text = text.replace(
            match.group(),
            '<sup id="_ref-%s" class="reference"><a href="#_note-%s">[%d]</a>'
            '</sup>' % (anchor, anchor, index), 1)
        notes.append('<li id="_note-%s"><a href="#_ref-%s">^</a> '
                     % (anchor, anchor) + match.group('content') + '</li>')
        index += 1
    # Self-closing <ref name=... /> re-uses an earlier footnote.
    for match in re.finditer(r'(?is)<ref\b(?P<params>[^/>]*?)/>', text):
        named = _REF_NAME_RE.search(match.group('params'))
        if not named:
            continue
        name = named.group('name').strip()
        if name not in numbers:
            continue  # unknown name: leave the tag untouched
        text = text.replace(
            match.group(),
            '<sup class="reference"><a href="#_note-%s">[%d]</a></sup>'
            % (escapeId(name), numbers[name]))
    listing = '<ol class="references">\n' + '\n'.join(notes) + '\n</ol>'
    # A callable replacement keeps backslashes in footnotes literal.
    return re.sub(r'(?i)<references */?>', lambda m: listing, text, 1)


def _sanitize_html(text):
    """Entity-escape every tag whose element name is not whitelisted."""
    # NOTE(review): the original sanitizer body was lost; this version is
    # inferred from the later check for an already-escaped "&lt;math".
    def check(m):
        closing, name, attribs = m.groups()
        if name.lower() in _ALLOWED_TAGS:
            return '<%s%s%s>' % (closing, name, cleanAttribs(attribs))
        return m.group().replace('<', '&lt;').replace('>', '&gt;')
    return re.sub(r'<(/?)(\w+)([^<>]*)>', check, text)


def _table_to_html(table):
    """Convert one ``{| ... |}`` span (nested tables already done) to HTML."""
    # NOTE(review): the element names were stripped from these replacements
    # and are restored from the wiki table syntax each pattern matches.
    table = re.sub(r'(?m)^\s*\{\| *(.*)', r'<table \1>', table)
    table = table.replace('\n|}', '\n</table>')
    table = re.sub(r'(?m)^\s*\|-+ *(.*)', r'<tr \1>', table)
    table = re.sub(r'(?m)^\s*\|\+ *([^][\n|]*\||)(.*)',
                   r'<caption>\2</caption>', table)
    # Put cells written on one line ("a || b", "a !! b") on separate lines.
    for cell in re.finditer(r'(?m)^([!|])(.*?)(\|\||!!)', table):
        table = table.replace(
            cell.group(), cell.group(1) + cell.group(2) + '\n' + cell.group(1))
    table = re.sub(r'(?m)^\! *([^][<>\n|]*(?=\|)|)\|* *(.*)',
                   r'<th \1>\2</th>', table)
    table = re.sub(r'(?m)^\| *([^][<>\n|]*(?=\|)|)\|* *(.*)',
                   r'<td \1>\2</td>', table)
    return table


def _render_tables(text):
    """Convert wiki tables to HTML, innermost first, at most 20 levels deep."""
    stack = [text.find('{|')]
    while 20 > len(stack) > 0:
        start = stack.pop()
        if start == -1:
            continue
        end = text.find('|}', start + 2)
        next_start = text.find('{|', start + 2)
        if end > next_start > start:
            # A nested table opens before this one closes: do it first.
            stack.append(start)
            stack.append(next_start)
        elif end > start > next_start or next_start > end > start:
            text = (text[:start] + _table_to_html(text[start:end + 2])
                    + text[end + 2:])
            if stack == [] or next_start > end:
                stack.append(text.find('{|', start + 2))
        elif end < 2:
            break  # unterminated table: leave the rest as it is
        else:
            raise ValueError('%r' % stack)
    return text


def _render_headers(text):
    """Turn ``== Heading ==`` lines into <hN> elements with an anchor id."""
    # NOTE(review): <level>/<head> group names and the heading markup were
    # stripped; the five format arguments (level, anchor, title, anchor,
    # level) survived.
    for m in re.finditer(
            r'(?m)^(?P<level>=+)(?P<head>.+?)(?P=level)?\s*$', text):
        lvl = min(len(m.group('level')), 6)
        title = m.group('head').strip()
        anchor = escapeId(title)
        # char ref for the pilcrow since the string may be byte or unicode
        text = text.replace(
            m.group(),
            '<h%d id="%s">%s<a href="#%s" title="&#182;"></a></h%d>\n\n'
            % (lvl, anchor, title, anchor, lvl), 1)
    return text


def _render_blocks(text):
    """Render ``*``/``#``/``;``/``:`` lists and space-indented <pre> blocks."""
    # NOTE(review): the original chain of substitutions was too damaged to
    # recover; this is a line-by-line replacement for it.
    out = []
    open_items = []  # marker of the open item at each nesting depth
    in_pre = False

    def close_to(depth):
        while len(open_items) > depth:
            list_tag, item_tag = _LIST_MARKERS[open_items.pop()]
            out.append('</%s></%s>' % (item_tag, list_tag))

    for line in text.split('\n'):
        if line.startswith(' ') and line.strip():
            close_to(0)
            if not in_pre:
                out.append('<pre>')
                in_pre = True
            out.append(line[1:])
            continue
        if in_pre:
            out.append('</pre>')
            in_pre = False
        prefix = re.match(r'[*#;:]*', line).group()
        if not prefix:
            close_to(0)
            out.append(line)
            continue
        content = line[len(prefix):].strip()
        # How many already-open lists does this line continue?
        shared = 0
        limit = min(len(prefix), len(open_items))
        while (shared < limit and _LIST_MARKERS[prefix[shared]][0]
               == _LIST_MARKERS[open_items[shared]][0]):
            shared += 1
        close_to(shared)
        parts = []
        if shared == len(prefix):
            # Next item of the innermost open list.
            parts.append('</%s>' % _LIST_MARKERS[open_items[-1]][1])
            parts.append('<%s>' % _LIST_MARKERS[prefix[-1]][1])
            open_items[-1] = prefix[-1]
        else:
            for marker in prefix[shared:]:
                parts.append('<%s><%s>' % _LIST_MARKERS[marker])
                open_items.append(marker)
        if prefix[-1] == ';' and ':' in content:
            # ";term: definition" written on a single line
            term, _sep, definition = content.partition(':')
            content = '%s</dt><dd>%s' % (term.strip(), definition.strip())
            open_items[-1] = ':'
        out.append(''.join(parts) + content)

    if in_pre:
        out.append('</pre>')
    close_to(0)
    return '\n'.join(out)


def _encode_entities(text):
    """Escape ``&`` while turning existing entities into numeric references."""
    def numeric(m):
        name = m.group(1)
        if name in name2codepoint:
            return '&#%d;' % name2codepoint[name]
        return m.group()

    text = text.replace('&', '&amp;')
    text = re.sub(r'&amp;(\w+);', numeric, text)
    return re.sub(r'&amp;#(\d+|[xX][0-9A-Fa-f]+);', r'&#\1;', text)


def _render_internal_links(text, docroot):
    """Convert ``[[Target|label]]trail`` links; ISO dates become two links."""
    # Convert ISO dates
    for l in re.finditer(
            r'\[\[(\d{4})-(0[1-9]|1[012])-([0-3]\d)\]\]', text):
        text = text.replace(
            l.group(),
            "[[%(month)s %(day)s]], [[%(year)s]]" % {
                'year': l.group(1),
                'month': monthname[l.group(2)],
                'day': l.group(3),
            })
    for l in re.finditer(
            r'\[\[([^][{|}]+)(?:\|([^][]*)|)\]\](\w*)', text):
        link = l.group(1)
        if link.startswith(':'):
            link = link[1:]
        if any(c in link for c in '<>[]|{}\n'):
            continue
        # NOTE(review): anchor markup reconstructed from the four surviving
        # format arguments (docroot, target, title, label).
        text = text.replace(
            l.group(),
            '<a href="%s%s" title="%s">%s</a>' % (
                docroot,
                link.replace(' ', '_').strip('_:').replace('"', '&quot;'),
                link.replace('"', '&quot;'),
                (l.group(2) or link) + l.group(3)))
    return text


def _create_link(m):
    """re.sub callback for external links: bare, labelled or auto-numbered."""
    # NOTE(review): anchor markup reconstructed; only the group references
    # and the "[%s]" auto-number label survived.
    global autonumber
    if m.group(3):
        return m.expand(r'<a href="\3" class="external free">\3</a>')
    if m.group(2):
        return m.expand(r'<a href="\1" class="external text">\2</a>')
    autonumber += 1
    return m.expand(
        r'<a href="\1" class="external autonumber">[%s]</a>' % autonumber)


def cleanAttribs(s):
    """Return the attribute string *s* unchanged (no filtering is done)."""
    return s


def escapeId(s):
    """Return *s* encoded for use as an HTML id / fragment identifier."""
    return escapeUrl(s).replace('%', '.')


def escapeUrl(s):
    """Encode *s* so only ``-.0-9:A-Z_a-z`` remain.

    Spaces become underscores and every other byte of the UTF-8 encoding is
    written as ``.XX`` (upper-case hex).  Returns '' if *s* is not a string.
    """
    if not safe_map:  # generate when first used
        safe = ('-.0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '_abcdefghijklmnopqrstuvwxyz')
        for i in range(256):
            c = chr(i)
            safe_map[c] = c if c in safe else '.%02X' % i
    try:
        s = s.replace(' ', '_')
        if not isinstance(s, bytes):
            # Work on bytes so characters outside Latin-1 are encoded
            # instead of silently emptying the whole result.
            s = s.encode('utf-8')
        return ''.join([safe_map[chr(b)] for b in bytearray(s)])
    except (AttributeError, TypeError):
        return ''


def nowiki(s):
    """Replace characters with wiki meaning in *s* by numeric references.

    Affects ``{}[]<>':;*#``; everything else passes through unchanged.
    Returns '' if *s* cannot be iterated.
    """
    try:
        return ''.join([_NOWIKI_MAP.get(c, c) for c in s])
    except TypeError:
        return ''


##
hideTokens = {}
# NOTE(review): the original five patterns were stripped to empty strings
# (which would match at every position).  Comments and <pre> blocks are the
# two spans this module visibly needs to shield from wiki processing.
hideRegex = re.compile('|'.join([
    r'<!--.*?-->',
    r'<pre[^<>]*>.*?</pre>',
]), re.I | re.S)


def hideText(text):
    """Swap spans matched by hideRegex for numbered placeholder tokens.

    The spans are remembered in the module-level ``hideTokens`` until
    showText() restores them.
    """
    global hideTokens
    # Continue numbering so tokens from an earlier call are not overwritten.
    n = max(hideTokens) if hideTokens else 111
    for m in hideRegex.finditer(text):
        if not m.group():
            continue  # an empty match would insert a token everywhere
        n += 1
        hideTokens[n] = m.group()
        text = text.replace(m.group(), '⌊⌊⌊⌊%06d⌋⌋⌋⌋' % n)
    return text


def showText(text):
    """Restore the spans hidden by hideText() and forget them.

    Raises RuntimeError if a placeholder is still present afterwards.
    """
    global hideTokens
    for (key, value) in hideTokens.items():
        text = text.replace('⌊⌊⌊⌊%06d⌋⌋⌋⌋' % key, value)
    hideTokens = {}  # Empty, also on the error path below
    if re.search(r'⌊⌊⌊⌊\d{6,}⌋⌋⌋⌋', text):
        print("WARNING: Unable to replace all hidden tokens")
        raise RuntimeError(
            "Please report this problem at [[User talk:Dispenser]]")
    return text