#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Readability.py
&params;
-limit:n Stops after n pages
Basic pagegenerators commands, -page, etc...
TODO:
* Change from color indicator to size indicator on bar graph, this is more intuitive
* Add more algorithms
* MOS checking from automatic peer review
* get algorithms from Eti Yaari - [http://www.haaretz.com/hasen/spages/985096.html]
"""
# (C) 2008 - [[w:en:user:Dispenser]]
#
import re, math, time
import wikipedia, pagegenerators
from syllable import syllable
# Module-wide tally of lower-cased word -> occurrence count. It is updated as
# a side effect of Readability.__init__, so it accumulates across every text
# analysed during the run.
unique_words = {}
# Regexes
# WordCount matches a run of word characters, hyphens and apostrophes.
WordCount = re.compile(r'[\w\-\']+') #should include -
# SentenceCount matches at least two word characters, optional closing
# quotes/brackets/spaces, one or more of . ! ? and then a whitespace, quote or
# bracket character.
SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]')
# Recommendations for script
# Canned advice strings. NOTE(review): `LargeMsg` and `largeMsg` differ only
# in the case of the first letter and hold different texts.
StubMsg = "This article seems to be a stub."
LargeMsg = "This article seems to be too long and probably needs to be split."
EasyMsg = "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy."
HardMsg = "This article seems to have too many long words and sentences for even most university graduates to easily read and understand."
HugeMsg = "Almost certainly should be divided up"
largeMsg = "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)"
# Mini library of common functions
import time
def profile(text):
    """Print a timing checkpoint: the current clock reading and a label.

    Args:
        text: Label describing the checkpoint.

    NOTE(review): the format string was empty, so formatting two values into
    it raised TypeError. The HTML-comment wrapper below is a reconstruction
    (the script emits HTML) -- confirm against the upstream copy.
    """
    # time.clock() was removed in Python 3.8; fall back to perf_counter there.
    clock = getattr(time, 'clock', None) or time.perf_counter
    print('<!-- %s %s -->' % (clock(), text))
def printu(ustr):
    """Print a string, falling back to its UTF-8 encoding if that fails.

    Args:
        ustr: The (unicode) string to write to standard output.
    """
    try:
        print(ustr)
    except UnicodeError:
        # Only encoding problems are retried; a bare ``except`` would also
        # have swallowed KeyboardInterrupt and unrelated I/O failures.
        print(ustr.encode('utf-8'))
def rowPrint(*cells):
    """Print one HTML table row, one table cell per argument.

    Each cell is converted with ``str`` and passed through ``wikiLink`` before
    being printed.

    Args:
        *cells: Values to show in the row.

    NOTE(review): the string literals here were split across lines, which is
    a syntax error. The <tr>/<td> markup is a reconstruction -- confirm
    against the upstream copy.
    """
    print('<tr>')
    for cell in cells:
        print('<td>%s</td>' % wikiLink(str(cell)))
    print('</tr>')
def printOptions(dict, default):
    """Print one HTML <option> element per entry, pre-selecting ``default``.

    Args:
        dict: Mapping whose keys become option values and whose values become
            the visible labels. (The name shadows the builtin; it is kept
            because it is part of the signature.)
        default: Key of the entry to mark as selected.

    NOTE(review): both format strings were empty, so formatting two values
    into them raised TypeError. The <option> markup is a reconstruction --
    confirm against the upstream copy.
    """
    for key in dict:
        selected = ' selected="selected"' if key == default else ''
        print('<option value="%s"%s>%s</option>' % (key, selected, dict[key]))
def wikiLink(text):
    """Replace ``[[target|label]]`` / ``[[target]]`` wiki links with anchors.

    The link target has spaces turned into underscores and leading/trailing
    underscores stripped. The host name comes from the module-level ``site``
    object, which must be set before any text containing a link is passed in.

    Args:
        text: String that may contain wiki-style links.

    Returns:
        The text with every wiki link replaced.

    NOTE(review): the format string was ``'%s'`` with three values, which
    raised TypeError. The anchor markup is a reconstruction (it mirrors the
    ``http://%s/w/`` rewrite used elsewhere in this file) -- confirm against
    the upstream copy.
    """
    def _anchor(match):
        label = match.group(2)
        target = (match.group(1) or label).replace(' ', '_').strip('_')
        return '<a href="http://%s/wiki/%s">%s</a>' % (
            site.hostname(), target, label)

    return re.sub(r'\[\[(?:([^][|]*)\||)(.*?)\]\]', _anchor, text)
def age(grade):
    """Return ``grade`` shifted up by 5.3 and truncated to an integer.

    Presumably maps a school grade level to an approximate reader age --
    verify against the callers.
    """
    shifted = grade + 5.3
    return int(shifted)
#import colorsys
#def hlsHTML(h,l,s):
# return "#%02x%02x%02x"%tuple(int(v*255) for v in colorsys.hls_to_rgb(h,l,s))
def Gaussian(x, height, center, width):
    """Evaluate a Gaussian (bell curve) at ``x``.

    f(x) = a * e ^ -( (x-b)^2 / 2c^2 )

    Args:
        x: Point at which to evaluate the curve.
        height: a, the height of the curve's peak.
        center: b, the position of the center of the peak.
        width: c, controls the width of the "bell"; must be non-zero.

    Returns:
        The curve value as a float.

    Raises:
        ZeroDivisionError: if ``width`` is 0.
    """
    # Use the real e via math.exp (the old code approximated it as 2.7), and
    # force float division so integer arguments are not truncated on Python 2.
    exponent = -((x - center) ** 2) / (2.0 * width ** 2)
    return height * math.exp(exponent)
def colorValue(subset, total, revert = False, accuracy = 2):
    """Map the ratio ``subset / total`` onto a red -> yellow -> green colour.

    Args:
        subset: Numerator of the ratio.
        total: Denominator of the ratio; must be non-zero.
        revert: When true, flip the scale so a full ratio gives red.
        accuracy: Unused; kept so existing callers keep working.

    Returns:
        An HTML colour string of the form ``#RRGGBB`` (blue is always 00).

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    # Clamp to 0..255 and truncate to an int: %02X rejects floats on
    # Python 3, and a negative ratio used to produce a malformed colour.
    level = int(max(0, min(255.0 * subset / total, 255)))
    if revert:
        level = 255 - level
    if level < 128:
        # Red to Yellow
        red, green = 255, 2 * level
    else:
        # Yellow to Green
        red, green = 2 * (255 - level), 255
    return "#%02X%02X%02X" % (red, green, 0)
#
class Readability(object):
    """Gather counts from a text and compute readability scores from them.

    The constructor tallies words, letters, sentences, syllables,
    mono-/polysyllabic words and long words; each scoring method combines
    those counters. Every division is guarded with ``or 1`` so that an empty
    text does not raise ZeroDivisionError.
    """

    # Patterns are compiled once for the class instead of on every use.
    _WORD_RE = re.compile(r'[\w\'-]+')
    _SENTENCE_RE = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]')
    _NON_WORD_RE = re.compile(r'\W')

    def __init__(self, text):
        """Count the features of ``text`` needed by the scoring methods.

        Side effect: every word is also tallied (lower-cased) in the
        module-level ``unique_words`` dictionary.

        Args:
            text: The prose to analyse.
        """
        self.text = text
        self.chars = len(text.strip())
        self.words = 0
        self.letters = 0
        self.sentences = 0
        self.syllables = 0
        self.monosyllables = 0
        self.polysyllables = 0
        self.longwords = 0
        for word in self._WORD_RE.findall(text):
            key = word.lower()
            # dict.get replaces the former bare ``except:`` around ``+= 1``.
            unique_words[key] = unique_words.get(key, 0) + 1
            self.words += 1
            # Hyphens and apostrophes do not count as letters.
            self.letters += len(self._NON_WORD_RE.sub('', word))
            word_syllables = syllable(word)
            self.syllables += word_syllables
            if word_syllables < 2:
                self.monosyllables += 1
            elif word_syllables >= 3:
                self.polysyllables += 1
            # Long words (more than 6 characters) are counted regardless of
            # the syllable count; lix() uses this counter.
            if len(word) > 6:
                self.longwords += 1
        self.sentences = len(self._SENTENCE_RE.findall(text))

    def flesch(self):
        """Flesch Reading Ease (Rudolf Flesch); higher means easier."""
        words = float(self.words or 1)
        sentences = float(self.sentences or 1)
        # 206.835 is the published constant; the code used to have 206.8.
        return 206.835 - (84.6 * self.syllables / words) - (1.015 * self.words / sentences)

    def ari(self):
        """Automated Reading Index (ARI)."""
        letters_per_word = self.letters / float(self.words or 1)
        words_per_sentence = float(self.words) / float(self.sentences or 1)
        return (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43

    def kincaid(self):
        """Flesch-Kincaid grade level."""
        words_per_sentence = self.words / float(self.sentences or 1)
        syllables_per_word = self.syllables / float(self.words or 1)
        return (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59

    def colemanLiau(self):
        """Coleman-Liau index.

        'A computer readability formula designed for machine scoring',
        Journal of Applied Psychology. (1975)
        """
        words = float(self.words or 1)
        return (5.89 * self.letters / words) - (0.3 * float(self.sentences) / words) - 15.8

    def fog(self):
        """Gunning Fog index, will change when I find Gunning's original."""
        words_per_sentence = float(self.words) / float(self.sentences or 1)
        complex_percent = 100. * (float(self.polysyllables) / float(self.words or 1))
        return 0.4 * (words_per_sentence + complex_percent)

    def smog(self):
        """Simple Measure Of Gobbledygook (SMOG) grade."""
        return 1.0430 * math.sqrt(self.polysyllables * 30.0 / (self.sentences or 1)) + 3.1291

    def lix(self):
        """Laesbarhedsindex (LIX); usable for any language."""
        words_per_sentence = self.words / float(self.sentences or 1)
        long_percent = (100. * self.longwords) / float(self.words or 1)
        return words_per_sentence + long_percent

    def linsear(self):
        """Linsear Write score.

        Polysyllabic words weigh 3, all other words weigh 1. The weighted
        total per sentence is halved, after subtracting 2 when it is 20 or
        below.
        """
        hardwords = self.polysyllables
        easywords = self.words - hardwords
        # 3.0 forces float arithmetic so Python 2 does not truncate.
        score = (3.0 * hardwords + easywords) / (self.sentences or 1)
        if score <= 20:
            score -= 2
        return score / 2

    def raygor(self):
        """Raygor Estimate graph; not implemented, always returns 0."""
        return 0

    def rix(self):
        """Rate Index; not implemented, always returns 0."""
        return 0

    def fry(self):
        """Fry Readability formula.

        Produces coordinates to place a dot on the graph: stores them in
        ``self.fry_x`` and ``self.fry_y``, sets ``self.Fry`` to 0 and
        returns 0.
        """
        # float() stops Python 2 from truncating syllables-per-word.
        self.fry_x = (float(self.syllables) / (self.words or 1) * 100) - 108
        x = 100 * float(self.sentences) / float(self.words or 1)
        self.fry_y = -0.005*x**4 + 0.3764*x**3 - 10.611*x**2 + 136.93*x - 245.68
        self.Fry = 0
        return 0
def prosegen(text):
    """Blank out every line of ``text`` that does not start with ``<p>``.

    Lines are emptied rather than removed, so the line breaks stay.

    Args:
        text: Rendered HTML, one element per line.

    Returns:
        The text with the content of non-paragraph lines removed.

    NOTE(review): the regex literal was split across two lines, which is a
    syntax error. The ``<p>`` lookahead is a reconstruction -- confirm
    against the upstream copy.
    """
    return re.sub(r'(?m)^(?!<p>).*?$', '', text)
def report(text):
# NOTE(review): this function is not valid Python as it stands. Several string
# literals are split across lines, the call opened by
# `text.replace(m.group(), (` is never closed, and `result` and `ra` are read
# without being assigned anywhere in this function. It looks as though the
# markup inside the literals and part of the body were lost -- TODO restore
# from the upstream copy before relying on it.
# What is still visible: for each regex match in `text`, a regex substitution
# on the match yields `prose`; matches whose prose is empty or contains no '.'
# are skipped. Later the function prints `graph`, values derived from
# `avg = ra.smog()`, a legend, and finally a copy of `text` in which the
# '"/wiki/' and '="/w/' prefixes are rewritten using the global `site`.
results = []
graph = ""
count = 0
for m in re.finditer(r'(?m)^
.*?
$', text):
prose = re.sub(r'?\w+[^<>]*>', '', m.group())
if not prose: continue
elif not '.' in prose: continue
count += 1
text = text.replace(m.group(), ('
'
for item in result:
if item[0].isdigit():continue
print '%s : %s ' % (item[1], item[0])
count += 1
if count >= 100:
break
elif count%20 == 0:
print '
'
print '
'
avg = ra.smog()
print '
'
print '
'
print graph
print '
3σ
' % (avg-4.5,)
# print '
2σ
' % (avg-3.0,)
# print '
1σ
' % (avg-1.5,)
print '' % (avg, avg,)
print '
'
print """
LEGEND : Each bar corresponds to a paragraph. The red line is the mean of the bars, the boxes are the population standard distrabution ranges. Thus, all the bars should lay within the boxes. Red bars indicate that the conditions for the test have not been meet.
"""
print '
'
print """
""" % (site.hostname(), site.hostname())
htext = text.replace('"/wiki/', '"readability2.0.py?page=')
htext = htext.replace('="/w/', '="http://%s/w/'%site.hostname().encode('utf-8'))
print htext
def main():
global site
genFactory = pagegenerators.GeneratorFactory()
generator = None
for arg in wikipedia.handleArgs():
if arg.startswith('-text:'):
text = arg[6:].replace('_', ' ')
else:
generator = genFactory.handleArg(arg) or generator
#wikipedia.startContent(u'Readability for %s' % page.title(), form=False, submitLabel="Check page")
#print ''
if not generator:
# syntax error, show help text from the top of this file
wikipedia.showHelp('readability2')
return
for page in generator:
site = page.site()
text = site.getUrl(site.nice_get_address(page.urlname()))
text = re.search(r'(?su)\s*(.*?)