#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Readabilty.py v1.4
"""
# (c) 2008 - Dispenser
#
# TODO:
# add per-paragraph readability scores: width = readability, height = amount of text parsed, separated by sections
import re, math
import wikipedia, pagegenerators
import cgitb; cgitb.enable()
# Optional wikitext renderer.  The name stays bound to None when the module
# cannot be imported; printWiki() tests it for truthiness before using it.
parser = None
try:
    import parser
except ImportError:
    pass
try:
    import syllable
except ImportError:
    # Fallback used when no ``syllable`` module can be imported: a small
    # heuristic counter exposed under the same name.
    vowels_R = re.compile(r'[aeiouy]+')

    class syllable:
        """Heuristic syllable counter based on runs of vowels."""

        def syllable(self, word):
            """Estimate the number of syllables in *word*.

            * A string made only of digits counts one syllable per digit.
            * Any other string that is not purely alphabetic (including the
              empty string) counts as 0.
            * Otherwise one common ending is trimmed and the remaining runs
              of ``aeiouy`` are counted; the result is never less than 1.

            The word is lowercased first.  The vowel pattern and the suffix
            checks only know lowercase letters, so without this a capitalised
            word lost its leading vowel ("Idea" counted 1, "idea" counted 2).
            """
            if word.isdigit():
                return len(word)
            if not word.isalpha():
                return 0
            word = word.lower()
            # Strip endings (only the first matching rule is applied)
            if word.endswith('e'):
                word = word[:-1]
            elif word.endswith(('es', 'ed')):
                word = word[:-2]
            elif word.endswith(('ing', 'ies')):
                word = word[:-3]
            elif word.endswith('ely'):
                # Drop only the "e" of "-ely", keeping the trailing "ly".
                word = word[:-3] + word[-2:]
            # Count vowel groups; a word with none still counts as one.
            return len(vowels_R.findall(word)) or 1
# How AWB does it
# Remove tables and templates
# Count instances using r"[a-zA-Z]+"
# Regex flags shared by the two counting patterns below.
flags = re.UNICODE
# Matches a run of word characters that are not digits (i.e. letters and
# underscore); hyphens are not part of a match.
WordCount = re.compile(r'[^\W\d]+', flags) #should include \-
# Matches a sentence end: at least two word characters, optional spaces,
# quotes or closing brackets, one or more of . ! ?, and then a single
# whitespace, quote, bracket or parenthesis character.
SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', flags)
def ages(grade):
    """Return the age range for a grade level as a "low/high" string.

    The bounds are ``grade + 4.8`` and ``grade + 5.8`` (5.3 +/- .5), each
    truncated to an integer by the ``%d`` conversion.
    """
    # FIXME document where the 5.3 comes from
    youngest = grade + 4.8
    oldest = grade + 5.8
    return "%d/%d" % (youngest, oldest)
def wikiLink(text, linkClass='extiw'):
    """Replace ``[[target|label]]`` / ``[[target]]`` links in *text* with HTML anchors.

    text      -- string that may contain wiki links
    linkClass -- value written into the anchor's ``class`` attribute

    The link target has spaces turned into underscores and leading/trailing
    underscores removed; when there is no ``target|`` part the label doubles
    as the target.  Text without links is returned unchanged.

    The previous format string contained a single ``%s`` for four arguments,
    so any text with a link raised TypeError.
    NOTE(review): the anchor markup below was reconstructed from the four
    arguments (host, target, class, label) -- confirm against the upstream
    script.  Neither the target nor the label is HTML-escaped here.
    """
    def anchor(match):
        label = match.group(2)
        target = (match.group(1) or label).replace(' ', '_').strip('_')
        return '<a href="http://%s/wiki/%s" class="%s">%s</a>' % (
            'en.wikipedia.org', target, linkClass, label)

    # A callable replacement keeps backslashes in labels literal and handles
    # every link in one pass.
    return re.sub(r'\[\[(?:([^][|]*)\||)(.*?)\]\]', anchor, text)
def rowPrint(*cells):
    """Print one row: a separator, each cell on its own line, a separator.

    Every cell is converted with ``str()``, passed through wikiLink() and
    written as ``| <cell> | ``.

    The two separator literals were split across physical lines, which is a
    syntax error; they are written here as an explicit newline.
    NOTE(review): these literals look like they once held row/cell markup
    that has been lost -- confirm against the upstream script.
    """
    print('\n')
    for cell in cells:
        print('| %s | ' % wikiLink(str(cell)))
    print('\n')
def printWiki(text, docroot=None, linkClass='extiw'):
    """Print *text* rendered by the optional ``parser`` module.

    text      -- wikitext to show
    docroot   -- base URL handed to the renderer; when falsy the English
                 Wikipedia article path is used
    linkClass -- accepted but not used by this function

    Without a usable ``parser`` module the text is passed to
    wikipedia.output() unrendered.
    """
    if not parser:
        wikipedia.output(text)
        return
    base = docroot or 'http://%s/wiki/' % 'en.wikipedia.org'
    rendered = parser.parser(text, docroot=base, allowHtml=True, sanitize=True)
    print(rendered.encode('utf-8'))
def divne(x, y):
    """Divide x by y as floats; a falsy y (0, None, '') is treated as 1."""
    numerator = float(x)
    denominator = float(y) if y else float(1)
    return numerator / denominator
# Patterns for the link passes.  They are compiled together with their flags
# because the fourth positional argument of re.sub() is ``count``, not
# ``flags``: passing the flags there silently disabled them and capped the
# number of replacements per call.
_anyWikiLink_R = re.compile(r'\[\[[^][]+\]\]')
_prefixedLink_R = re.compile(
    r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]',
    re.IGNORECASE | re.DOTALL | re.UNICODE)
_plainLink_R = re.compile(r'\[\[([^]|[]+\|)?([^][]+)\]\]',
                          re.IGNORECASE | re.DOTALL)
_imageLink_R = re.compile(r'\[\[Image:[^][]*\]\]', re.IGNORECASE | re.DOTALL)
_externalLink_R = re.compile(r'\[\w+:/*[^][<>\s"]* *(.*?)\]')


def _stripWikiLinks(text):
    """Drop prefixed links (Image:, Category:, short prefixes) and unwrap the rest to their label."""
    # Each pass only touches links without brackets inside them, so nested
    # links are resolved from the inside out over several iterations.
    while _anyWikiLink_R.search(text):
        text = _prefixedLink_R.sub('', text)
        text = _plainLink_R.sub(r'\2', text)
        text = _imageLink_R.sub('', text)
    return text


def _stripTemplates(text):
    """Cut ``{{...}}`` spans out of *text*, innermost first, up to a nesting depth of 10."""
    # The stack holds offsets of "{{" openers that are still to be removed.
    stack = [text.find('{{')]
    while 10 > len(stack) > 0:
        start = stack.pop()
        if start == -1:
            continue
        closing = text.find('}}', start)
        if closing == -1:
            # Unclosed template: leave the text alone.  The old ``find()+2``
            # turned the miss into offset 1 and deleted a character when the
            # opener sat at offset 0.
            continue
        end = closing + 2
        nextStart = text.find('{{', start + 2)
        if end > nextStart > start:
            # Another template opens before this one closes: handle the inner
            # one first and come back to this opener afterwards.
            stack.append(start)
            stack.append(nextStart)
        else:
            text = text[:start] + text[end:]
            if not stack:
                # Nothing pending: continue with the next opener, if any.
                stack.append(text.find('{{', start))
    return text


def removeWikiContainers(text):
    """Reduce wikitext to (mostly) plain prose.

    In order: disabled parts are removed via wikipedia.removeDisabledParts(),
    wiki links are dropped or replaced by their label, ``{{...}}`` templates
    are cut out, bold/italic quote markers are removed and bracketed external
    links are replaced by their label.  Returns the resulting text.

    An empty line is printed to stdout after each of the first three phases.
    NOTE(review): those prints look like phase markers whose text was lost --
    confirm against the upstream script.
    """
    text = wikipedia.removeDisabledParts(text)
    print('')
    # Remove links
    text = _stripWikiLinks(text)
    print('')
    # Remove templates
    text = _stripTemplates(text)
    print('')
    # Remove italics and bolding (three quotes first, then the remaining pairs)
    text = text.replace("'''", "")
    text = text.replace("''", "")
    # Remove external links, keeping only their label
    text = _externalLink_R.sub(r'\1', text)
    return text
def htmlStats(page, date=None, oldid=None):
    """Print the output of the external ``style`` program for one page.

    page  -- page object; ``get()``, ``site()`` and ``_redirarg`` are used
    date  -- not used yet (see TODO below)
    oldid -- not used yet (see TODO below)

    Redirects print a link to the target, blank pages print a notice and
    pages containing ``{{disambig`` are just rendered; in all three cases
    nothing is analysed.  Otherwise the wikitext is stripped with
    removeWikiContainers(), piped as UTF-8 into /home/dispenser/bin/style
    (NOTE(review): presumably the GNU diction ``style`` utility -- confirm),
    its output is printed line by line and the parsed text is shown after it.

    NOTE(review): several printed literals are empty or a bare newline and
    look like they lost their markup; the literals that were split across
    lines (a syntax error) are written as an explicit newline here.
    """
    import subprocess

    try:
        # TODO implement date or oldid
        wikitext = page.get()
    except wikipedia.IsRedirectPage:
        # Was ``Wikipedia.Page`` -- a NameError, the module is ``wikipedia``.
        print(wikipedia.Page(page.site(), page._redirarg).aslink())
        return
    if not wikitext.strip():
        wikipedia.output('Page does not exist')
        return
    elif '{{disambig' in wikitext:
        printWiki(wikitext, docroot='readability.py?page=%s:' % page.site().language(), linkClass='')
        return

    plaintext = removeWikiContainers(wikitext)
    p = subprocess.Popen(
        ["/home/dispenser/bin/style"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        shell=False)
    # Hand the input to communicate() instead of writing to stdin first:
    # it feeds stdin and drains stdout together and closes the pipe, so a
    # large page cannot block on a full pipe buffer.
    report = p.communicate(plaintext.encode('utf-8'))[0]
    print('')
    for aline in report.splitlines():
        print(aline)#.split(': ')
    print('')
    print('\n')
    # Whitespace hack
    # see http://developer.mozilla.org/en/Whitespace_in_the_DOM
    print('''Show parsed text''')
    # Leading spaces are removed from every line before rendering.
    printWiki(re.sub(r'(?m)^ *', r'', plaintext), linkClass="")
    print('\n')
def main():
    """Parse the arguments, start the output page and report on each page.

    Recognised arguments: ``-debug`` sets the global ``debug`` flag;
    ``-format:``, ``-oldid:`` and ``-targetdate:`` are parsed but their values
    are not used further (two of them print "Not implemented").  Every other
    argument is handed to the page generator factory.  When no generator
    results, the help text is shown instead.
    """
    global debug
    debug = False
    genFactory = pagegenerators.GeneratorFactory()
    site = wikipedia.getSite()
    generator = None
    page = wikipedia.Page(site, '')
    outputFormat = 'html'

    for arg in wikipedia.handleArgs():
        if arg.startswith('-debug'):
            debug = True
            continue
        if arg.startswith('-format:'):
            outputFormat = arg[len('-format:'):]
            print("Not implemented")
            continue
        if arg.startswith('-oldid:'):
            oldid = arg[len('-oldid:'):]
            continue
        if arg.startswith('-targetdate:'):
            print("Not implemented")
            offset = arg[len('-targetdate:'):]
            continue
        # Anything else is left to the generator factory; the latest result
        # replaces any earlier one.
        generator = genFactory.handleArg(arg)

    # Start page
    wikipedia.startContent(u'Readability: %s' % page.title(), form=False)
    print('')
    if generator:
        for page in generator:
            htmlStats(page)
    else:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('readability14')
# Script entry point: only runs when executed directly and when
# wikipedia.handleUrlAndHeader() returns a true value.
if __name__ == "__main__":
    if wikipedia.handleUrlAndHeader():
        try:
            main()
        finally:
            # Always close the page and shut the framework down, even when
            # main() raised.
            wikipedia.endContent()
            wikipedia.stopme()