#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
Readabilty.py v1.4

Prints size statistics and readability indexes (Flesch, ARI,
Flesch-Kincaid, Coleman-Liau, Gunning fog, SMOG, Fry, LIX and Linsear
Write) for wiki pages as HTML.
"""
# (c) 2008 - Dispenser
#
# TODO:
#   add per paragraph readability scores, width = readability,
#   height = amount of text parsed, separated by sections
#
# NOTE(review): this file was recovered from a copy in which the HTML
# markup inside string literals had been stripped and the line structure
# collapsed.  Every literal tagged "reconstructed" below is a minimal
# best-effort replacement (element names, classes, URLs and styling are
# guesses) and must be checked against the original before deployment.
#
# The code targets Python 2 (pywikipedia "wikipedia" module); print is
# always called with a single parenthesised argument so the syntax is also
# accepted by Python 3.
import math
import re

import pagegenerators
import wikipedia

import cgitb; cgitb.enable()

try:
    import parser
    # Python 2 ships an unrelated stdlib module called "parser"; only use
    # the import when it is the wikitext parser that offers parser.parser().
    if not hasattr(parser, 'parser'):
        parser = None
except ImportError:
    parser = None

ArticleWordStub = "This article seems to be a stub. Readability scores will not be accurate."
ArticleWordLarge = "This article seems to be too long and probably needs to be split."
ArticleLowIndex = "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy."
ArticleHighIndex = "This article seems to have too many long words and sentences for even most university graduates to easily read and understand."
ArticleSizeLarge = "This article almost certainly should be divided up."
ArticleSizeBig = "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)"

# How AWB does it
# Remove tables and templates
# Count instances using r"[a-zA-Z]+"
flags = re.UNICODE
WordCount = re.compile(r'[^\W\d]+', flags)  # should include \-
SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', flags)

# Defaults for the globals that main() overwrites, so that the helpers
# below can be used without running main() first.
debug = False
hostname = ''

# NOTE(review): reconstructed -- the original image location was lost.
FryGraphImage = 'fry-graph.png'

# Upper boundaries of the Fry grade bands as (grade, slope, offset); a text
# is in the first band for which  z > slope * x - offset.  Listed in grade
# order: the recovered source tested grade 5 after grades 6 and 7, which
# made grade 5 unreachable for typical inputs.
FryBoundaries = (
    (1, .230769, 7),
    (2, .238462, 10.2692),
    (3, .275000, 17.2),
    (4, .290000, 20.9),
    (5, .303125, 24.3438),
    (6, .400000, 40.6),
    (7, .523077, 62.9077),
    (8, .615000, 80.25),
    (9, .900000, 128),
    (10, .950000, 140.9),
    (11, .933333, 142.733),
    (12, 1.28750, 205.15),
)

# [[target|label]] or [[label]]
WikiLinkPattern = re.compile(r'\[\[(?:([^][|]*)\||)(.*?)\]\]')

# Patterns used by removeWikiContainers().  They are compiled with their
# flags because the fourth positional argument of re.sub() is "count", not
# "flags" -- the recovered source passed the flags there.
InnermostLink = re.compile(r'\[\[[^][]+\]\]')
NamespacedLink = re.compile(
    r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]',
    re.IGNORECASE | re.UNICODE)
PipedLink = re.compile(r'\[\[([^]|[]+\|)?([^][]+)\]\]')

# Everything that is not "readable prose".
ProseStrip = re.compile('|'.join((
    # NOTE(review): reconstructed -- paired HTML elements with content.
    r'<(\w+)[^>/]*>.*?</\1>',
    r'\{\|.*?\|\}',
    r'\{\{.*?\}\}',
    r'^=+[^\n]+=+ *$',
    r'__[A-Z0-9]+__',
    r'^ *\*[^\n]*$',
    r'^ *\#[^\n]*$',
    r'^ [^\n]+$',
    r'^;+[^\n]*$',
    #r'^:+[^\n]{1,200}$', # can do full definition list
    r"^:+ *\w[^\n]*$",  # remove dab like lines
    r'&[^;]{2,6};')),
    re.IGNORECASE | re.DOTALL | re.MULTILINE | re.UNICODE)


def average(list):
    """Return the arithmetic mean of *list* as a float, or 0 when empty."""
    if not list:
        return 0
    return float(sum(list)) / float(len(list))


def grade2ages(grade):
    """Format a U.S. grade level as a "low/high" reader age range."""
    # FIXME document where the 5.3 comes from
    return "%d/%d" % (grade + 4.8, grade + 5.8)  # 5.3 +/- .5


def htmlEscape(text):
    """Escape &, <, > and double quotes for use in HTML text/attributes."""
    return (text.replace('&', '&amp;').replace('<', '&lt;')
                .replace('>', '&gt;').replace('"', '&quot;'))


def wikiLink(text, linkClass='extiw'):
    """Replace [[wiki links]] in *text* with HTML anchors.

    The link target is the part before the pipe (or the label when there
    is no pipe) with spaces turned into underscores; the host comes from
    the module-level ``hostname``.
    """
    def anchor(m):
        link = (m.group(1) or m.group(2)).replace(' ', '_').strip('_')
        # NOTE(review): reconstructed anchor markup.
        return '<a href="http://%s/wiki/%s" class="%s">%s</a>' % (
            hostname, link, linkClass, m.group(2))
    return WikiLinkPattern.sub(anchor, text)


def rowPrint(*cells):
    """Print one table row with a cell per argument (wiki links expanded)."""
    # NOTE(review): reconstructed row/cell markup.
    print('<tr>')
    for cell in cells:
        print('<td>%s</td>' % wikiLink(str(cell)))
    print('</tr>')


def printWiki(text, docroot=None, linkClass='extiw'):
    """Print *text* rendered by the wikitext parser, or raw if unavailable.

    *linkClass* is accepted for interface compatibility but is not used.
    """
    if parser:
        print(parser.parser(text,
                            docroot=docroot or 'http://%s/wiki/' % hostname,
                            allowHtml=True,
                            sanitize=True).encode('utf-8'))
    else:
        wikipedia.output(text)


def stripWiki(text):
    """Placeholder: returns *text* unchanged."""
    return text


def divne(x, y):
    " Divide, no exceptions/errors: a zero divisor is treated as 1. "
    return float(x) / float(y or 1)

########################################


class ReadingNumbers(object):
    """Collects counts from a text and derives readability indexes."""

    def __init__(self, IgnoreWords=None):
        # A fresh list per instance: the recovered source used a mutable
        # default argument and appended to it on every construction.
        #IgnoreWords = re.split(r'\W', page.title().lower())
        ignore = list(IgnoreWords) if IgnoreWords is not None else []
        ignore.append(u'wikipedia')
        self.IgnoreWords = ignore

    def populateStats(self, text):
        """Count characters, words, sentences, letters and syllables.

        Sets: characters, text, markedup, words, sentences, longwords,
        HardWords, letters, syllables, avgSyllables.  ``words`` and
        ``sentences`` are never 0 so later divisions are safe.
        """
        # Remove HTML
        text = re.sub(r'<[^>]*>', r' ', text)
        # store
        self.characters = len(text)
        self.text = text
        self.markedup = text
        self.words = len(WordCount.findall(text)) or 1
        self.sentences = len(SentenceCount.findall(text)) or 1
        self.longwords = 0
        if debug:
            # NOTE(review): markup lost; the recovered literal was empty.
            print('')
        for s in SentenceCount.finditer(text):
            ending = s.group()
            # NOTE(review): reconstructed span; the class name is a guess.
            self.markedup = self.markedup.replace(
                ending,
                '<span class="sentence-end">%s</span>%s' % (ending[:-1],
                                                            ending[-1]))

        # The actual statistics stuff, should be branched off into an
        # object/class
        # NOTE(review): the proper-noun pattern was lost; this version
        # takes capitalised words that do not directly follow sentence
        # punctuation or a line break.
        PropNouns = set(re.findall(
            r'(?<![.!?]\s)(?<!\n)\b[A-Z][a-z]+', text))
        if debug:
            print("<!-- proper nouns: %s -->" % sorted(PropNouns))
        letters = 0
        self.HardWords = 0
        syllable_list = []
        marked = set()  # hard words already highlighted in markedup
        # NOTE(review): markup lost; the recovered literal was empty.
        print('')
        #for word in re.finditer(r'(?u)\w+', text):
        for word in WordCount.finditer(text):
            original = word.group()
            s = original
            if len(s) > 6:
                self.longwords += 1
            # if s.lower() in IgnoreWords:
            #     continue
            # Drop common silent/inflection endings before counting
            # vowel groups.
            if len(s) > 1 and s.endswith('e'):
                s = s[:-1]
            elif len(s) > 2 and s.endswith(('es', 'ed')):
                s = s[:-2]
            elif len(s) > 3 and s.endswith(('ing', 'ies')):
                s = s[:-3]
            elif len(s) > 3 and s.endswith('ely'):
                s = s[:-3] + s[-2:]
            vowels = re.findall(r'(?i)[aeiouy]+', s)
            if vowels:
                letters += len(original)  # Make sure we're adding words
                syllable_list.append(len(vowels))
            elif s.isdigit():
                # Only reachable if the word pattern is changed to
                # include digits (see the commented-out loop above).
                syllable_list.append(len(s))
            if (len(vowels) >= 3 and original not in PropNouns
                    and not s.isdigit()):
                self.HardWords += 1
                if original not in marked:
                    marked.add(original)
                    # The tag placed right after the word keeps an
                    # occurrence from matching (and being wrapped) twice.
                    # NOTE(review): reconstructed span; class is a guess.
                    self.markedup = re.sub(
                        r'\b(%s)\b([^<>])' % re.escape(original),
                        r'<span class="hardword">\1</span>\2',
                        self.markedup)
        self.letters = letters
        self.syllables = sum(syllable_list)
        self.avgSyllables = average(syllable_list)
        #print syllable_list
        # NOTE(review): markup lost; the recovered literal was empty.
        print('')

    def printFryGraph(self):
        """ Prints an HTML version the fry graph """
        # FIXME get the original formula and read paper
        x = self.avgSyllables * 100
        y = 100 * float(self.sentences) / float(self.words)
        fry_x = (x - 108) * 22 / 2 + 42
        fry_y = (-0.005 * y ** 4 + 0.3764 * y ** 3 - 10.611 * y ** 2
                 + 136.93 * y - 245.68)
        # NOTE(review): reconstructed markup -- an image with a marker
        # positioned over it; only the coordinates and the visible texts
        # "fry graph" and "Br.ko" survive from the original.
        print('<div style="position:relative;">')
        print('<img src="%s" alt="fry graph" />' % FryGraphImage)
        print('<div style="position:absolute; left:%dpx; top:%dpx;">' % (
            (fry_x - 4) / 1.9275, (497 - fry_y - 4) / 1.9275, ))
        print('Br.ko')
        print('</div>\n</div>')

    def makeIndexes(self):
        """Compute the readability indexes from the populated counts."""
        words = float(self.words)
        sentences = float(self.sentences)
        wordsPerSentence = words / sentences
        lettersPerWord = self.letters / words

        # Calculate stats
        self.Flesch_Ease = (206.835 - (1.015 * wordsPerSentence)
                            - (84.6 * self.avgSyllables))
        self.ARI = (4.71 * lettersPerWord) + (0.5 * wordsPerSentence) - 21.43
        self.Flesch_Kincaid = ((0.39 * wordsPerSentence)
                               + (11.8 * self.avgSyllables) - 15.59)
        # Coleman-Liau uses letters and sentences *per 100 words*; the
        # recovered source left out the factor 100 on the sentence term.
        self.Cloeman_Liau = ((0.0588 * 100 * lettersPerWord)
                             - (0.296 * 100 * sentences / words) - 15.8)
        self.Gunning_fog = 0.4 * (wordsPerSentence
                                  + 100 * (self.HardWords / words))
        self.SMOG = (1.0430 * math.sqrt(30.0 * self.HardWords / sentences)
                     + 3.1291)
        #LIX = W/S+(100*LW)/W
        self.LIX = (divne(self.words, self.sentences)
                    + divne(100. * self.longwords, self.words))

        # linsear write
        hardwords = self.HardWords
        #hardwords = self.polysyllables
        easywords = self.words - hardwords
        # float() avoids Python 2 integer (floor) division
        r = (3 * hardwords + easywords) / float(self.sentences or 1)
        if r > 20:
            r /= 2
        else:
            r = (r - 2) / 2
        self.linsearwrite = r

        # Fry readability graph/formula
        x = self.avgSyllables * 100
        y = 100 * sentences / words
        z = 27.97134 + (-99.5798 / y)  # linearize
        self.Fry = 13
        for grade, slope, offset in FryBoundaries:
            if z > (x * slope) - offset:
                self.Fry = grade
                break


def removeWikiContainers(text):
    """Strip links, templates, bold/italic and external-link markup."""
    text = wikipedia.removeDisabledParts(text)
    # NOTE(review): markup lost; the recovered literal was empty.
    print('')
    # Remove links, innermost first so nested links (e.g. in image
    # captions) disappear over successive passes.
    while InnermostLink.search(text):
        text = NamespacedLink.sub('', text)
        text = PipedLink.sub(r'\2', text)
    # NOTE(review): markup lost; the recovered literal was empty.
    print('')

    # Remove templates
    # Template stripper: the stack holds the start offsets of the
    # currently open (nested) templates; nesting is capped at 10 levels.
    stack = [text.find('{{')]
    while 10 > len(stack) > 0:
        start = stack.pop()
        if start == -1:
            continue
        close = text.find('}}', start)
        if close == -1:
            # Unclosed template: nothing after it can be closed either.
            # (The recovered source looped forever here.)
            break
        end = close + 2
        nextStart = text.find('{{', start + 2)
        if end > nextStart > start:
            # A nested template opens before this one closes.
            stack.append(start)
            stack.append(nextStart)
        else:
            text = text[:start] + text[end:]
        if not stack:
            stack.append(text.find('{{', start))
    # NOTE(review): markup lost; the recovered literal was empty.
    print('')

    # Remove italics and bolding
    text = text.replace("'''", "")
    text = text.replace("''", "")

    # Remove External links
    text = re.sub(r'\[\w+:/*[^][<>\s"]* *(.*?)\]', r'\1', text)
    return text


def htmlStats(page, date=None, oldid=None):
    """Print the statistics report for *page* as HTML.

    *date* and *oldid* are accepted but not implemented yet.
    """
    try:
        # TODO implement date or oldid
        wikitext = page.get()
    except wikipedia.NoPage:
        wikipedia.output('Page does not exist')
        return
    except wikipedia.IsRedirectPage:
        # NOTE(review): reconstructed anchor; only the four arguments
        # survive from the original.
        print('#REDIRECT <a href="http://%s/wiki/%s:%s">%s</a>' % (
            page.hostname(), page.site().language(),
            page._redirarg, page._redirarg))
        return
    if not wikitext.strip():
        wikipedia.output('Page is empty')
        return
    elif '{{disambig' in wikitext:
        printWiki(wikitext,
                  docroot='readability.py?page=%s:' % page.site().language(),
                  linkClass='')
        return

    # Raw wikitext counts.  NOTE(review): table markup reconstructed.
    print('<table>')
    rowPrint('', 'Paragraphs (est.)',
             len(re.split(r'(?um)\n\s*[\n=]', wikitext)))
    rowPrint('', 'Links', len(re.findall(r'\[\[[^][:]+\]\]', wikitext)))
    rowPrint('', 'Images', len(re.findall(r'\[\[ *[Ii]mage:', wikitext)))
    rowPrint('', 'Categories',
             len(re.findall(r'\[\[ *[Cc]ategory:', wikitext)))
    rowPrint('', 'Interwiki links',
             len(re.findall(r'\[\[ *[a-z\-]{2,4}:', wikitext)))
    rowPrint('', 'External links',
             len(re.findall(r'\w+://\w+\.\w+', wikitext)))
    # NOTE(review): the counted tag was stripped (leaving count('')),
    # '</ref>' is assumed.
    rowPrint('', 'References', wikitext.lower().count('</ref>'))
    rowPrint('', 'Templates', wikitext.lower().count('}}'))
    print('</table>')

    plaintext = removeWikiContainers(wikitext)
    textstats = ReadingNumbers()
    textstats.populateStats(plaintext)
    textstats.makeIndexes()

    # Get "readable prose"
    prosestats = ReadingNumbers()
    prosestats.populateStats(ProseStrip.sub(r'', plaintext))
    prosestats.makeIndexes()

    # NOTE(review): table markup reconstructed; the header texts
    # (Text / Proses / Counts / Averages) survive from the original.
    print('<table>')
    print('<tr><th></th><th>Text</th><th>Proses</th></tr>')
    print('<tr><th colspan="3">Counts</th></tr>')
    rowPrint('Words', textstats.words, prosestats.words)
    rowPrint('Characters', textstats.characters, prosestats.characters)
    rowPrint('Sentences', textstats.sentences, prosestats.sentences)
    rowPrint('Letters', textstats.letters, prosestats.letters)
    rowPrint('Syllables', textstats.syllables, prosestats.syllables)
    rowPrint('Polysyllabic words', textstats.HardWords, prosestats.HardWords)
    print('<tr><th colspan="3">Averages</th></tr>')
    rowPrint('Characters per word',
             '%.2f' % (textstats.characters / float(textstats.words)),
             '%.2f' % (prosestats.characters / float(prosestats.words)))
    rowPrint('Letters per word',
             '%.2f' % (float(textstats.letters) / float(textstats.words)),
             '%.2f' % (float(prosestats.letters) / float(prosestats.words)))
    rowPrint('Syllables per word',
             '%.2f' % textstats.avgSyllables,
             '%.2f' % prosestats.avgSyllables)
    rowPrint('Words per sentence',
             '%.2f' % (textstats.words / float(textstats.sentences)),
             '%.2f' % (float(prosestats.words) / float(prosestats.sentences)))
    print('</table>')

    # Size Comparisons: one bar per measure, 1px per 350 characters.
    # NOTE(review): bar markup reconstructed.
    print('<div>')
    print('Wikitext: %3.3g KB' % (len(wikitext) / 1024.))
    print('<div style="width:%dpx; border:1px solid;">&nbsp;</div>'
          % (len(wikitext) // 350))
    print('Text: %3.3g KB' % (textstats.characters / 1024.))
    print('<div style="width:%dpx; border:1px solid;">&nbsp;</div>'
          % (textstats.characters // 350))
    print('Proses: %3.3g KB' % (prosestats.characters / 1024.))
    print('<div style="width:%dpx; border:1px solid;">&nbsp;</div>'
          % (prosestats.characters // 350))
    print('</div>')

    # Clear
    print('<div style="clear:both;"></div>')

    # Fry graph
    print('<div>')
    prosestats.printFryGraph()
    # http://justendeal.com/blog/2006/01/03/reproducible-fry-graphs/#comment-101
    print('Fry readability graph. The line is an ideal balance between '
          'long sentences and long words.<br />'
          'x: average number of syllables per hundred words;<br />'
          'y: average number of sentences per hundred words.')
    print('</div>')

    # Readability Indexes (print)
    # NOTE(review): table markup reconstructed; caption and header texts
    # survive from the original.
    print('<table>')
    print('<caption>Proses readability</caption>')
    print('<tr><th>Test</th><th>U.S. grade level</th>'
          '<th>Reader age</th><th>Scale</th></tr>')
    rowPrint('[[Flesch Reading Ease]]', '', '',
             '%.1f' % prosestats.Flesch_Ease)
    rowPrint('[[Automated Readability Index]] (ARI)',
             '%.1f' % prosestats.ARI, grade2ages(prosestats.ARI))
    rowPrint('[[Flesch-Kincaid]]', '%3.1f' % prosestats.Flesch_Kincaid,
             grade2ages(prosestats.Flesch_Kincaid))
    rowPrint('[[Coleman-Liau]]', '%3.1f' % prosestats.Cloeman_Liau,
             grade2ages(prosestats.Cloeman_Liau))
    rowPrint('[[Gunning Fog]]', '%3.1f' % prosestats.Gunning_fog,
             grade2ages(prosestats.Gunning_fog))
    rowPrint('[[SMOG]]', '%3.1f' % prosestats.SMOG,
             grade2ages(prosestats.SMOG))
    if prosestats.Fry == 13:
        rowPrint('[[Fry Readability Formula]]', '13+', '17+')
    else:
        rowPrint('[[Fry Readability Formula]]', prosestats.Fry,
                 grade2ages(prosestats.Fry))
    rowPrint('[[Laesbarhedsindex]] (LIX)', '', '',
             '%3.1f' % prosestats.LIX)
    rowPrint('[[Linsear Write]]', '%3.1f' % prosestats.linsearwrite,
             grade2ages(prosestats.linsearwrite))
    print('</table>')

    # Suggestion: the first matching condition wins.
    if textstats.words < 300:
        suggest = ArticleWordStub
    elif textstats.words > 10000:
        suggest = ArticleWordLarge
    elif prosestats.SMOG <= 8:
        suggest = ArticleLowIndex
    elif prosestats.SMOG >= 16:
        suggest = ArticleHighIndex
    elif textstats.characters > 75e3:
        suggest = ArticleSizeLarge
    elif textstats.characters > 50e3:
        suggest = ArticleSizeBig
    else:
        suggest = None

    if suggest:
        # NOTE(review): reconstructed markup around "Alert" and the text.
        print('<div><b>Alert</b><br />%s</div>' % suggest)
    print('<div style="clear:both;"></div>')

    # Whitespace hack: link and container are emitted without whitespace
    # between them so that nextSibling is the container element.
    # see http://developer.mozilla.org/en/Whitespace_in_the_DOM
    # NOTE(review): reconstructed; only the text "Show parsed text"
    # survives, and showing prosestats.markedup here is an assumption.
    print('<a href="#" onclick="this.nextSibling.style.display=\'block\';'
          ' return false;">Show parsed text</a>'
          '<div style="display:none;">%s</div>'
          % prosestats.markedup.encode('utf-8'))


def main():
    """Parse the arguments, print the page header and report each page."""
    global debug
    global hostname
    debug = False
    genFactory = pagegenerators.GeneratorFactory()
    site = wikipedia.getSite()
    generator = None
    page = wikipedia.Page(site, '')
    for arg in wikipedia.handleArgs():
        if arg.startswith('-page:'):
            # Remember the page for the title/form; the argument still
            # falls through to the generator factory below.
            page = wikipedia.Page(site, arg[6:])
        if arg.startswith('-lang'):
            print("Backward comptability not implemented")
        elif arg.startswith('-debug'):
            debug = True
        elif arg.startswith('-format:'):
            print("Not implemented")
        else:
            generator = genFactory.handleArg(arg)

    wikipedia.startContent(u'Readability: %s' % page.title(), form=False)
    # NOTE(review): reconstructed form markup.  The title is user input,
    # so it is escaped before being placed in the attribute.
    print('<form action="readability.py">')
    print('<input type="text" name="page" value="%s" />'
          % htmlEscape(page.aslink()[2:-2].encode('utf-8')))
    print('</form>')
    hostname = page.hostname()
    if page.site().sitename() == 'wikipedia:en' and page.title():
        print("""<div>
Do not rely on information presented here. This tool produces incorrect output due to limitations in its wikitext to HTML parsing and a spurious syllable counter; not to mention the uncertainty of the readability algorithms.
</div>""")
    if not generator:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('readability14')
        return
    else:
        for page in generator:
            htmlStats(page)


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        main()
    finally:
        wikipedia.endContent()
        wikipedia.stopme()