#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Readability.py

&params;

-limit:n          Stops after n pages

Basic pagegenerators commands, -page, etc...

TODO:
 * Change from color indicator to size indicator on bar graph, this is more
   intuitive
 * Add more algorithms
 * MOS checking from automatic peer review
 * Get algorithms from Eti Yaari -
   [http://www.haaretz.com/hasen/spages/985096.html]
"""
# (C) 2008 - [[w:en:user:Dispenser]]
#
import math
import re
import time

import pagegenerators
import wikipedia
from syllable import syllable

# Running word -> occurrence count table, shared by the whole module.
unique_words = {}

# Regexes
WordCount = re.compile(r'[\w\-\']+')  # should include -
SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]')

# Recommendations for script
StubMsg = "This article seems to be a stub."
LargeMsg = "This article seems to be too long and probably needs to be split."
EasyMsg = "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy."
HardMsg = "This article seems to have too many long words and sentences for even most university graduates to easily read and understand."
HugeMsg = "Almost certainly should be divided up"
largeMsg = "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)"

# Mini library of common functions
import time


def profile(text):
    """Print the current ``time.clock()`` reading together with *text*."""
    # NOTE(review): the format string is empty in this copy of the file (its
    # markup looks stripped), so formatting two values into it raises
    # TypeError.  Restore the original two-placeholder template.
    print('' % (time.clock(), text))


def printu(ustr):
    """Print *ustr*; if that fails on encoding, print its UTF-8 bytes."""
    try:
        print(ustr)
    except UnicodeError:
        # Only an encoding failure is worth retrying as UTF-8; anything else
        # should surface instead of being silently swallowed.
        print(ustr.encode('utf-8'))


def rowPrint(*cells):
    """Print every cell, passed through ``wikiLink``, as one output row.

    NOTE(review): the surrounding literals are empty here; they presumably
    carried row/cell markup that was stripped from this copy -- confirm.
    """
    print('')
    for cell in cells:
        print('%s' % wikiLink(str(cell)))
    print('')


def printOptions(dict, default):
    """Print one entry per key of *dict*, treating *default* separately.

    NOTE(review): both templates are empty in this copy, so the two branches
    are indistinguishable and each raises TypeError when formatted.  The
    original (selected / unselected) templates need to be restored.
    """
    for key in dict:
        if key == default:
            print('' % (key, dict[key]))
        else:
            print('' % (key, dict[key]))


def wikiLink(text):
    """Replace ``[[target|label]]`` / ``[[target]]`` markup found in *text*.

    Text without wiki links is returned unchanged.  Uses the module-level
    ``site`` that ``main()`` assigns.
    """
    for m in re.finditer(r'\[\[(?:([^][|]*)\||)(.*?)\]\]', text):
        # Link target: explicit target if given, else the label; spaces
        # become underscores and edge underscores are trimmed.
        link = (m.group(1) or m.group(2)).replace(' ', '_').strip('_')
        # NOTE(review): the template has one placeholder for three values in
        # this copy (markup stripped), so this raises TypeError on any link.
        text = text.replace(m.group(), r'%s' % (site.hostname(), link, m.group(2)))
    return text


def age(grade):
    """Return ``grade + 5.3`` truncated to an integer."""
    return int(grade + 5.3)


#import colorsys
#def hlsHTML(h,l,s):
#    return "#%02x%02x%02x"%tuple(int(v*255) for v in colorsys.hls_to_rgb(h,l,s))

def Gaussian(x, height, center, width):
    """Evaluate a Gaussian bell curve at *x*.

    f(x) = a * e ^ -( (x-b)^2 / 2c^2 )

    where
      a (*height*) is the height of the curve's peak
      b (*center*) is the position of the center of the peak
      c (*width*) controls the width of the "bell"

    The exponent is computed in floating point, so integer arguments are not
    floor-divided, and the real constant e is used rather than 2.7.
    """
    return height * math.exp(-((x - center) ** 2) / (2.0 * width ** 2))


def colorValue(subset, total, revert = False, accuracy = 2):
    """Return an ``#RRGGBB`` color for the ratio ``subset / total``.

    The ratio is scaled to 0..255 and clamped at both ends.  Values below 128
    run from red to yellow, the rest from yellow to green.  *revert* flips
    the scale.  *accuracy* is accepted but not used.
    """
    # Clamp and convert to int: "%X" needs integers, and a negative ratio
    # would otherwise produce a malformed color string.
    v = int(max(0, min(255.0 * subset / total, 255)))
    if revert:
        v = 255 - v
    if v < 128:
        # Red to Yellow
        red, green = 255, 2 * v
    else:
        # Yellow to Green
        red, green = 2 * (255 - v), 255
    return "#%02X%02X%02X" % (red, green, 0)


#
class Readability(object):
    """Count words, letters, sentences and syllables of a text and derive
    readability scores from those counts.

    Every score guards its divisors with ``or 1`` so an empty text does not
    raise ZeroDivisionError.
    """

    _WORD_RE = re.compile(r'[\w\'-]+')
    _SENTENCE_RE = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]')

    def __init__(self, text):
        """Gather the counts for *text*.

        Also adds every word (lower-cased) to the module-level
        ``unique_words`` frequency table.
        """
        self.text = text
        self.chars = len(text.strip())
        self.words = 0
        self.letters = 0
        self.sentences = 0
        self.syllables = 0
        self.monosyllables = 0
        self.polysyllables = 0
        self.longwords = 0

        for word in self._WORD_RE.findall(text):
            key = word.lower()
            unique_words[key] = unique_words.get(key, 0) + 1

            self.words += 1
            # Letters exclude the hyphens/apostrophes the word regex allows.
            self.letters += len(re.sub(r'\W', '', word))

            word_syllables = syllable(word)
            self.syllables += word_syllables
            if word_syllables < 2:
                self.monosyllables += 1
            elif word_syllables >= 3:
                self.polysyllables += 1

            if len(word) > 6:
                self.longwords += 1

        self.sentences = len(self._SENTENCE_RE.findall(text))

    def flesch(self):
        """ Rudolf Flesch reading ease """
        return 206.8 - (self.syllables * 84.6/(self.words or 1)) - (self.words * 1.015/(self.sentences or 1))

    def ari(self):
        """ Automated Reading Index (ARI) """
        return ( 4.71 * self.letters/ float(self.words or 1)) + ( 0.5 * float(self.words) / float(self.sentences or 1) ) - 21.43

    def kincaid(self):
        """ Flesch-Kincaid """
        return ( 0.39 * self.words / float(self.sentences or 1) ) + ( 11.8 * self.syllables/(self.words or 1)) - 15.59

    def colemanLiau(self):
        """
        Coleman Liau
        'A computer readability formula designed for machine scoring',
        Journal of Applied Psychology. (1975)
        """
        return ( 5.89 * self.letters/ float(self.words or 1) ) - ( 0.3 * float(self.sentences) / float(self.words or 1) ) - 15.8

    def fog(self):
        """ Gunning Fog (author's note: to be revised against Gunning's original) """
        return 0.4 * ( (float(self.words) / float(self.sentences or 1)) + 100. * (float(self.polysyllables) / float(self.words or 1)) )

    def smog(self):
        """ Simple Measure Of Gobbledygook """
        return 1.0430*math.sqrt(self.polysyllables*30.0/(self.sentences or 1))+3.1291

    def lix(self):
        """ Laesbarhedsindex (Any language) """
        return (self.words/float(self.sentences or 1))+((100.*self.longwords)/float(self.words or 1))

    def linsear(self):
        """ Linsear Write

        Polysyllabic words count three points, all others one; the points
        per sentence are halved, after subtracting 2 when they are <= 20.
        Uses true division so the score is not truncated on Python 2.
        """
        hardwords = self.polysyllables
        easywords = self.words - hardwords
        r = (3 * hardwords + easywords) / float(self.sentences or 1)
        if r > 20:
            return r / 2
        return (r - 2) / 2

    def raygor(self):
        """ Raygor Estimate graph (not implemented, always 0) """
        return 0

    def rix(self):
        """ Rate Index (not implemented, always 0) """
        return 0

    def fry(self):
        """ Fry Readability formula

        Stores the coordinates for the dot on the graph in ``self.fry_x`` and
        ``self.fry_y``, sets ``self.Fry`` to 0 and returns 0.
        """
        words = float(self.words or 1)
        # Float division: integer division collapsed syllables-per-word to a
        # whole number on Python 2.
        self.fry_x = (self.syllables / words * 100) - 108
        x = 100 * float(self.sentences) / words
        self.fry_y = -0.005*x**4 + 0.3764*x**3 - 10.611*x**2 + 136.93*x - 245.68
        self.Fry = 0
        return 0


def prosegen(text):
    """Blank every line of *text* that the negative lookahead lets through.

    NOTE(review): the lookahead content is only two newlines in this copy of
    the file; the original tag was evidently stripped.  ``report()`` drops
    the first three characters of each paragraph match, which suggests an
    opening tag such as ``<p>`` -- confirm and restore before relying on it.
    """
    #text = re.sub(r']*>', '', text)
    #return re.sub(r'<(\w+)[^<>]*>', r'<\1>', text)
    return re.sub('(?m)^(?!\n\n).*?$', '', text)

).*?$', '', text) def report(text): results = [] graph = "" count = 0 for m in re.finditer(r'(?m)^

.*?

$', text): prose = re.sub(r']*>', '', m.group()) if not prose: continue elif not '.' in prose: continue count += 1 text = text.replace(m.group(), ('

' % count) + m.group()[3:]) results.append( prose ) pn =Readability(prose) # L = min(max(0, (pn.words-30)/50.), 1.0) # graph += '

%.2f, words:%.2f
\n' % (pn.smog(), hlsHTML(50/255., 170/255., L), count, pn.smog(), pn.words)\ # bell = (255)*(2.7**(((pn.words-50)**2)/(-2*20**2))) graph += '
%.2f, words:%.2f
\n' % (pn.smog(), colorValue(Gaussian(pn.words, 512, 100, 40), 255), count, pn.smog(), pn.words) if results == []: print text.replace('"/wiki/', '"?page=') return ra = Readability(re.sub(r']*>', '', ''.join(results))) ## XXX # print '' % re.sub(r']*>', '', '\n\n'.join(results)) print '' rowPrint('Test', 'US grade level', 'Scale') rowPrint('[[Flesch]]', '', ra.flesch()) rowPrint('[[ARI]]', ra.ari()) rowPrint('[[kincaid]]', ra.kincaid()) rowPrint('[[coleman-Liau]]', ra.colemanLiau()) rowPrint('[[Gunning fog]]', ra.fog()) rowPrint('[[SMOG]]', ra.smog()) rowPrint('[[Laesbarhedsindex]] (LIX)', '', ra.lix()) rowPrint('[[Linsear Write]]', ra.linsear()) rowPrint('Fry', ra.fry()) print '
' result = sorted(unique_words.iteritems(), key=lambda (k,v): (v,k), reverse=True) count = 0 print '' print '' print '
Word Frequency
' for item in result: if item[0].isdigit():continue print '%s : %s
' % (item[1], item[0]) count += 1 if count >= 100: break elif count%20 == 0: print '
' print '
' avg = ra.smog() print '
' print '
' print graph print '
' % (avg-4.5,) # print '
' % (avg-3.0,) # print '
' % (avg-1.5,) print '
' % (avg, avg,) print '
' print """
LEGEND : Each bar corresponds to a paragraph. The red line is the mean of the bars, the boxes are the population standard distrabution ranges. Thus, all the bars should lay within the boxes. Red bars indicate that the conditions for the test have not been meet.
""" print '
' print """ """ % (site.hostname(), site.hostname()) htext = text.replace('"/wiki/', '"readability2.0.py?page=') htext = htext.replace('="/w/', '="http://%s/w/'%site.hostname().encode('utf-8')) print htext def main(): global site genFactory = pagegenerators.GeneratorFactory() generator = None for arg in wikipedia.handleArgs(): if arg.startswith('-text:'): text = arg[6:].replace('_', ' ') else: generator = genFactory.handleArg(arg) or generator #wikipedia.startContent(u'Readability for %s' % page.title(), form=False, submitLabel="Check page") #print '
' % page.title().encode('utf-8') #print '
' if not generator: # syntax error, show help text from the top of this file wikipedia.showHelp('readability2') return for page in generator: site = page.site() text = site.getUrl(site.nice_get_address(page.urlname())) text = re.search(r'(?su)\s*(.*?)