#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Readabilty.py v1.4
"""
# (c) 2008 - Dispenser
#
# TODO:
#   add per paragraph readability scores, width = readability,
#   height = amount of text parsed, separated by sections
#
# NOTE(review): this file was recovered from a copy in which line structure
# was collapsed and HTML markup inside string literals had been stripped.
# Literals that still run are reproduced exactly as found (often '' or '\n').
# Only literals that made the code crash or fail to parse were repaired; each
# such spot carries its own NOTE(review) and should be checked against the
# deployed script.
#
# All print calls use the single-argument ``print(x)`` form, which behaves the
# same as the Python 2 print statement the script was written with.
import re, math
import subprocess
import wikipedia, pagegenerators
import cgitb; cgitb.enable()

try:
    import parser
except ImportError:
    parser = None

try:
    import syllable
except ImportError:
    vowels_R = re.compile(r'[aeiouy]+')

    class syllable:
        """Fallback stand-in for the optional ``syllable`` module."""

        @staticmethod
        def syllable(word):
            """Estimate the number of syllables in ``word``.

            Digit strings count one syllable per digit, anything that is not
            purely alphabetic counts zero, and every other word counts its
            vowel groups (minimum 1) after a common suffix is trimmed.

            Declared static so that ``syllable.syllable(word)`` works the
            same way it would on the real module; calling it on an instance
            still works too.
            """
            if word.isdigit():
                return len(word)
            if not word.isalpha():
                return 0
            # vowels_R only knows lowercase vowels.
            word = word.lower()
            # Strip endings
            if word.endswith('e'):
                word = word[:-1]
            elif word.endswith(('es', 'ed')):
                word = word[:-2]
            elif word.endswith(('ing', 'ies')):
                word = word[:-3]
            elif word.endswith('ely'):
                word = word[:-3] + word[-2:]
            # Count vowels
            return len(vowels_R.findall(word)) or 1

# How AWB does it
# Remove tables and templates
# Count instances using r"[a-zA-Z]+"

flags = re.UNICODE
WordCount = re.compile(r'[^\W\d]+', flags)  # should include \-
SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', flags)

# Patterns used by removeWikiContainers(), compiled once with real flags.
_ANY_LINK_R = re.compile(r'\[\[[^][]+\]\]')
_NAMESPACE_LINK_R = re.compile(
    r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]',
    re.IGNORECASE | re.DOTALL | re.UNICODE)
_PLAIN_LINK_R = re.compile(r'\[\[([^]|[]+\|)?([^][]+)\]\]',
                           re.IGNORECASE | re.DOTALL)
_IMAGE_LINK_R = re.compile(r'\[\[Image:[^][]*\]\]', re.IGNORECASE | re.DOTALL)


def ages(grade):
    """Format ``grade`` as a "low/high" pair: grade+4.8 and grade+5.8.

    ``%d`` truncates both values to integers.
    """
    # FIXME document where the 5.3 comes from
    return "%d/%d" % (grade + 4.8, grade + 5.8)  # 5.3 +/- .5


def wikiLink(text, linkClass='extiw'):
    """Replace every ``[[target|label]]`` / ``[[target]]`` in ``text``.

    The link target is the part before the pipe (or the label when there is
    no pipe), with spaces turned into underscores and surrounding
    underscores stripped.

    NOTE(review): the original replacement template survived only as '%s'
    while being fed four values (host, target, linkClass, label), which
    raises TypeError. The anchor markup below is a reconstruction from those
    four values and from the 'http://%s/wiki/' docroot used in printWiki();
    confirm against the deployed script.
    """
    def toAnchor(m):
        target = (m.group(1) or m.group(2)).replace(' ', '_').strip('_')
        # Keep a stray double quote from terminating the href attribute.
        target = target.replace('"', '%22')
        return '<a href="http://%s/wiki/%s" class="%s">%s</a>' % (
            'en.wikipedia.org', target, linkClass, m.group(2))

    return re.sub(r'\[\[(?:([^][|]*)\||)(.*?)\]\]', toAnchor, text)


def rowPrint(*cells):
    """Print each cell, with wiki links expanded, between two marker prints.

    NOTE(review): the three literals are reproduced as found; they presumably
    held table-row markup before extraction - confirm.
    """
    print('')
    for cell in cells:
        print('%s' % wikiLink(str(cell)))
    print('')


def printWiki(text, docroot=None, linkClass='extiw'):
    """Print ``text`` rendered by the optional wiki ``parser`` module.

    Falls back to ``wikipedia.output`` when no usable renderer is present.
    ``linkClass`` is accepted for call compatibility but is not used.
    """
    # A bare truth test on the module is not enough: an unrelated module
    # that happens to be named ``parser`` (the Python 2 standard library
    # ships one) imports fine but has no parser() callable.
    render = getattr(parser, 'parser', None)
    if render:
        print(render(text,
                     docroot=docroot or 'http://%s/wiki/' % 'en.wikipedia.org',
                     allowHtml=True, sanitize=True).encode('utf-8'))
    else:
        wikipedia.output(text)


def divne(x, y):
    " Divide, no exceptions/errors "
    # A falsy divisor (0, None, '') is replaced by 1.
    return float(x) / float(y or 1)


def _stripTemplates(text, maxDepth=10):
    """Remove ``{{...}}`` templates from ``text``, innermost first.

    ``stack`` holds the start offsets of the templates currently open.
    Processing stops when no ``{{`` is left, when the stack reaches
    ``maxDepth`` entries, or when an opening ``{{`` has no closing ``}}``.
    """
    stack = [text.find('{{')]
    while 0 < len(stack) < maxDepth:
        start = stack.pop()
        if start == -1:
            continue
        close = text.find('}}', start)
        if close == -1:
            # Unclosed template: nothing at or after ``start`` can be
            # removed. The original re-queued the same offset here and never
            # terminated (or chopped a single character when start was 0).
            break
        end = close + 2
        nextStart = text.find('{{', start + 2)
        if start < nextStart < end:
            # Another template opens before this one closes: handle the
            # inner one first, then come back to this one.
            stack.append(start)
            stack.append(nextStart)
        else:
            text = text[:start] + text[end:]
        if not stack:
            stack.append(text.find('{{', start))
    return text


def removeWikiContainers(text):
    """Reduce wikitext to plain prose.

    Steps, in order: drop the parts ``wikipedia.removeDisabledParts``
    removes, resolve ``[[...]]`` links, strip ``{{...}}`` templates, remove
    bold/italic quote markup, and reduce bracketed external links to their
    label. Returns the resulting text.

    NOTE(review): the three ``print('')`` calls are reproduced as found;
    their original content was lost in extraction.
    """
    text = wikipedia.removeDisabledParts(text)

    print('')
    # Remove links. Each pass resolves the innermost links, so nested links
    # (e.g. inside image captions) need several passes.
    # The original passed the re flags as the positional ``count`` argument
    # of re.sub, so matching was case-sensitive and capped per pass; the
    # precompiled patterns carry the flags properly.
    while _ANY_LINK_R.search(text):
        text = _NAMESPACE_LINK_R.sub('', text)
        text = _PLAIN_LINK_R.sub(r'\2', text)
    text = _IMAGE_LINK_R.sub('', text)

    print('')
    # Remove templates
    text = _stripTemplates(text)

    print('')
    # Remove italics and bolding
    text = text.replace("'''", "")
    text = text.replace("''", "")

    # Remove External links
    text = re.sub(r'\[\w+:/*[^][<>\s"]* *(.*?)\]', r'\1', text)
    return text


def htmlStats(page, date=None, oldid=None):
    """Print the external ``style`` program's report for ``page``.

    Redirects print a link to the redirect target, empty pages print a
    notice, and pages containing ``{{disambig`` are rendered as wikitext;
    none of these three is analysed. ``date`` and ``oldid`` are accepted but
    not implemented yet.
    """
    try:
        # TODO implement date or oldid
        wikitext = page.get()
    except wikipedia.IsRedirectPage:
        # Was ``Wikipedia.Page`` - a NameError, the module is ``wikipedia``.
        print(wikipedia.Page(page.site(), page._redirarg).aslink())
        return

    if not wikitext.strip():
        wikipedia.output('Page does not exist')
        return
    elif '{{disambig' in wikitext:
        printWiki(wikitext,
                  docroot='readability.py?page=%s:' % page.site().language(),
                  linkClass='')
        return

    plaintext = removeWikiContainers(wikitext)

    p = subprocess.Popen(
        ["/home/dispenser/bin/style"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        shell=False)
    print('\n')
    # Hand the input to communicate() rather than writing to p.stdin first:
    # writing a large text before reading stdout can fill the pipe buffers
    # and deadlock, and stdin was never closed explicitly.
    report = p.communicate(plaintext.encode('utf-8'))[0]
    for aline in report.splitlines():
        print(aline)  # .split(': ')
    print('\n')
    print('\n')
    # Whitespace hack
    # see http://developer.mozilla.org/en/Whitespace_in_the_DOM
    # NOTE(review): this triple-quoted literal was unterminated after the
    # markup around it was stripped; only the surviving text is printed.
    print('''Show parsed text''')


def main():
    """Parse the arguments, print the page header, and report on each page.

    Shows the help text when no page generator results from the arguments.
    """
    global debug
    debug = False
    genFactory = pagegenerators.GeneratorFactory()
    site = wikipedia.getSite()
    generator = None
    page = wikipedia.Page(site, '')
    outputFormat = 'html'  # parsed only; was ``format``, shadowing a builtin
    oldid = None
    for arg in wikipedia.handleArgs():
        if arg.startswith('-debug'):
            debug = True
        elif arg.startswith('-format:'):
            outputFormat = arg[8:]
            print("Not implemented")
        elif arg.startswith('-oldid:'):
            oldid = arg[7:]
        elif arg.startswith('-targetdate:'):
            print("Not implemented")
        else:
            generator = genFactory.handleArg(arg)

    # Start page
    wikipedia.startContent(u'Readability: %s' % page.title(), form=False)
    print('\n')
    # NOTE(review): this literal survived as '' while still being given the
    # page title, which raises TypeError. Only the %s placeholder is
    # restored; the surrounding markup is unknown.
    print('%s' % page.aslink()[2:-2].encode('utf-8'))
    print('')
    print('')
    print('\n')

    if not generator:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('readability14')
        return
    for page in generator:
        # oldid is forwarded for when htmlStats() implements it.
        htmlStats(page, oldid=oldid)


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        main()
    finally:
        wikipedia.endContent()
        wikipedia.stopme()