#!/usr/bin/python # -*- coding: utf-8 -*- # TODO: # add per paragraph readability scores """ Readabilty.py (c) Dispenser, 2007 """ import re, htmlentitydefs, wikipedia import parser from math import sqrt import cgi import cgitb; cgitb.enable() textFiles = './text/' ArticleWordStub = { 'en': "This article seems to be a stub." } ArticleWordLarge = { 'en': "This article seems to be too long and probably needs to be split." } ArticleLowIndex = { 'en': "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy." } ArticleHighIndex = { 'en': "This article seems to have too many long words and sentences for even most university graduates to easily read and understand." } ArticleSizeLarge = { 'en': "Almost certainly should be divided up" } ArticleSizeBig = { 'en': "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)" } Languages = { 'en': "English", 'simple': "Simple English", } Families = { 'wikibooks': "Wikibooks", 'wikipedia': "Wikipedia", 'wikisource': "Wikisource", 'wikinews': "Wikinews", 'wiktionary': "Wiktionary", 'wikiquote': "Wikiquote", 'wikiversity': "Wikiversity", } BracketCount = 0 # How AWB does it # Remove tables and templates # Count instances using r"[a-zA-Z]+" flags = re.UNICODE WordCount = re.compile(r'[^\W\d]+', flags) #should include \- SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', flags) class wikiParse: a = 0 def printu(ustr): try: print ustr except: print ustr.encode('utf-8') def printOptions(dict, default): for key in dict: if key == default: print '' % (key, dict[key]) else: print '' % (key, dict[key]) def printWiki(text, basehref=None, linkClass='extiw'): print(wikiToHTML(text, basehref, linkClass).encode('utf-8')) def wikiLink(text): for m in re.finditer(r'\[\[(?:([^][|]*)\||)(.*?)\]\]', text): link = (m.group(1) or m.group(2)).replace(' ', '_').strip('_') text = text.replace(m.group(), r'%s' % (hostname, link, m.group(2))) return text def wikiToHTML(text, basehref = None, linkClass = 'extiw'): # Define varibles if not basehref: basehref = 'http://%s/wiki/' % hostname if linkClass: linkClass = ' class="%s"' % linkClass text = parser.parser(text, docroot=basehref, allowHtml=True, sanitize=True) # Images for wimg in re.finditer(r'(?s)\[\[Image:([^\[|\]]+)(\|.*?|)\]\]', text): file = 'Image:%s' % wimg.group(1).replace('_', ' ').strip() attribs = wimg.group(2).split('|') size = '' float = None thumb = False caption = '' captionText = '' for s in wimg.group(2).split('|'): sl = s.lower().strip() if sl.endswith('px'): try: size = (int(s[:-2])*64/250) except: pass elif sl=='left' or sl=='right': float = s elif sl=='thumb' or sl=='thumbnail': thumb = True size = size or 64 float = float or 'right' else: caption = s captionText = wikipedia.escape(re.sub(r']*>', '', s)) if thumb: text = text.replace(wimg.group(), """
%s
%s
""" % (float, size+2, hostname, file.replace(' ', '_'), file, captionText, size, caption )) else: text = text.replace(wimg.group(), '%s' % (basehref, file, hostname, file, size, float, captionText)) return text # Strip HTML comments text = re.sub(r"(?s)", r'', text) text = text.expandtabs() # Nowiki # Nowikicode goes here # Tempate parser stack = [text.find('{{')] while 10 > len(stack) > 0: start = stack.pop() if start == -1: continue end = text.find('}}', start)+2 nextStart = text.find('{{', start+2) if end > nextStart > start: stack.append(start) stack.append(nextStart) elif end > start: tmpl = text[start:end] [(title, param)] = re.findall(r'\{\{[{}]*\s*([^{|}\n]+)(.*?)', tmpl) text = text.replace(tmpl, '{{[[Template:%s|%s]]}}' % (title, title)) if stack == []: stack.append(text.find('{{', start+2)) # , text = re.sub(r'(?is)(.*?)', r'Math tag', text) # text = text.replace('') # Headers text = re.sub(r'(?m)^=====(.*?)===== *$',r'
\1
', text) text = re.sub(r'(?m)^====(.*?)==== *$', r'

\1

', text) text = re.sub(r'(?m)^===(.*?)=== *$', r'

\1

', text) text = re.sub(r'(?m)^==(.*?)== *$', r'

\1

', text) text = re.sub(r'(?m)^=(.*?)= *$', r'

\1

', text) # definition lists text = re.sub(r"(?m)^;([^:]*\S)\s*", r'
\n
\1
\n
\n', text) text = re.sub(r"(?m)^:(.*)\s*", r'
\n
\1
\n
\n', text) text = text.replace(r'\n\n
', r'') # bullet and numbered lists text = re.sub(r"(?m)^ ?[*#]*\* *(.*)", r'', text) text = re.sub(r"(?m)^ ?[*#]*\# *(.*)", r'
    \n\t
  1. \1
  2. \n
', text) # Merge list together text = text.replace("\n