#!/usr/bin/python
# -*- coding: utf-8 -*-
# TODO: add per paragraph readability scores
"""
Readabilty.py
(c) Dispenser, 2007
"""
import re, htmlentitydefs, wikipedia
from math import sqrt
import cgi
import cgitb; cgitb.enable()

# Relative directory path; its consumer is not in this part of the file.
textFiles = './text/'

# Advisory messages keyed by language code (only 'en' is provided).
ArticleWordStub = {
    'en': "This article seems to be a stub."
}
ArticleWordLarge = {
    'en': "This article seems to be too long and probably needs to be split."
}
ArticleLowIndex = {
    'en': "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy."
}
ArticleHighIndex = {
    'en': "This article seems to have too many long words and sentences for even most university graduates to easily read and understand."
}
ArticleSizeLarge = {
    'en': "Almost certainly should be divided up"
}
ArticleSizeBig = {
    'en': "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)"
}

# Language code -> display name.
Languages = {
    'en': "English",
    'simple': "Simple English",
}

# Project family identifier -> display name.
Families = {
    'wikibooks': "Wikibooks",
    'wikipedia': "Wikipedia",
    'wikisource': "Wikisource",
    'wikinews': "Wikinews",
    'wiktionary': "Wiktionary",
    'wikiquote': "Wikiquote",
    'wikiversity': "Wikiversity",
}

BracketCount = 0


class wikiParse:
    a = 0


def printu(ustr):
    """Print ``ustr``; if that fails with a Unicode error, print it UTF-8 encoded.

    Only ``UnicodeError`` triggers the fallback, so unrelated failures
    (for example ``KeyboardInterrupt`` or a closed stdout) are no longer
    silently swallowed.
    """
    try:
        print(ustr)
    except UnicodeError:
        print(ustr.encode('utf-8'))


def printOptions(dict, default):
    """Print one HTML ``<option>`` element per entry of ``dict``.

    ``dict`` maps option values to their display labels; the entry whose
    key equals ``default`` is printed with the ``selected`` attribute.
    The parameter name shadows the builtin but is kept so existing
    keyword callers continue to work.

    NOTE(review): the format strings were empty in the reviewed copy (and
    raised ``TypeError`` because two values were supplied to a template
    with no ``%s``); the markup below is reconstructed from the function's
    name and its two-branch structure -- confirm against the live page.
    """
    for key in dict:
        if key == default:
            print('<option value="%s" selected="selected">%s</option>' % (key, dict[key]))
        else:
            print('<option value="%s">%s</option>' % (key, dict[key]))


def printWiki(text, basehref=None, linkClass='extiw'):
    """Convert ``text`` with ``wikiToHTML`` and print the result as UTF-8."""
    print(wikiToHTML(text, basehref, linkClass).encode('utf-8'))


def wikiLink(text):
    """Return ``text`` with every ``[[Title]]`` replaced by an HTML link.

    The link target is built from the module-level ``hostname`` and the
    title with spaces turned into underscores; the visible label is the
    title as written.

    NOTE(review): the original template was a bare ``'%s'`` fed three
    values, which raises ``TypeError``. The anchor markup is reconstructed
    to match the ``'http://%s/wiki/'`` base URL that ``wikiToHTML`` builds
    from the same ``hostname`` -- confirm the exact attributes.
    """
    for m in re.finditer(r'\[\[(.*?)\]\]', text):
        title = m.group(1)
        anchor = '<a href="http://%s/wiki/%s">%s</a>' % (
            hostname, title.replace(' ', '_'), title)
        text = text.replace(m.group(), anchor)
    return text


def wikiToHTML(text, basehref = None, linkClass = 'extiw'):
    # Define varibles
    if not basehref:
        basehref = 'http://%s/wiki/' % hostname
    if linkClass:
        linkClass = ' class="%s"' % linkClass
    # Strip HTML comments
    text = re.sub(r"(?s)", r'', text)
    text = text.expandtabs()
    # Nowiki
    #
Nowikicode goes here # Tempate parser stack = [text.find('{{')] while 10 > len(stack) > 0: start = stack.pop() if start == -1: continue end = text.find('}}', start)+2 nextStart = text.find('{{', start+2) if end > nextStart > start: stack.append(start) stack.append(nextStart) elif end > start: tmpl = text[start:end] [(title, param)] = re.findall(r'\{\{[{}]*\s*([^{|}\n]+)(.*?)', tmpl) text = text.replace(tmpl, '{{[[Template:%s|%s]]}}' % (title, title)) if stack == []: stack.append(text.find('{{', start+2)) # , text = re.sub(r'(?is)(.*?)', r'Math tag', text) # text = text.replace('') # Headers text = re.sub(r'(?m)^=====(.*?)===== *$',r'
\1
', text) text = re.sub(r'(?m)^====(.*?)==== *$', r'

\1

', text) text = re.sub(r'(?m)^===(.*?)=== *$', r'

\1

', text) text = re.sub(r'(?m)^==(.*?)== *$', r'

\1

', text) text = re.sub(r'(?m)^=(.*?)= *$', r'

\1

', text) # definition lists text = re.sub(r"(?m)^;([^:]*\S)\s*", r'
\n
\1
\n
\n', text) text = re.sub(r"(?m)^:(.*)\s*", r'
\n
\1
\n
\n', text) text = text.replace(r'\n\n
', r'') # bullet and numbered lists text = re.sub(r"(?m)^ ?[*#]*\* *(.*)", r'', text) text = re.sub(r"(?m)^ ?[*#]*\# *(.*)", r'
    \n\t
  1. \1
  2. \n
', text) # Merge list together text = text.replace("\n