#!/usr/bin/python
# -*- coding: utf-8 -*-
# TODO:
#   add per paragraph readability scores
"""
Readabilty.py
(c) Dispenser, 2007

CGI script that loads a wiki page through the ``wikipedia`` framework (or
takes raw wikitext from the ``raw`` form field), strips links, templates and
markup with regular expressions, counts words and sentences, and prints a
report of readability indexes (ARI, Flesch-Kincaid, Coleman-Liau, Gunning
fog, SMOG, LIX, Fry coordinates).
"""
# NOTE(review): this copy of the file looks like it went through an HTML
# stripper.  Everything between '<' and '>' appears to be missing from string
# literals and regular expressions, and the statement layout had to be
# reconstructed from a whitespace-collapsed paste.  Literals are reproduced
# exactly as found; lines tagged "[markup lost]" have a format string that no
# longer matches its arguments (they raise TypeError at runtime) or a pattern
# that no longer matches what its comment describes.  Restore those literals
# from the upstream file before deploying.
#
# ``print(x)`` with a single parenthesised argument is used throughout; it
# behaves the same as the Python 2 print statement the file was written with.

import re
import time
from math import sqrt

import cgi
import cgitb; cgitb.enable()

import wikipedia
# NOTE(review): presumably a project-local wikitext parser (it is called as
# parser.parser(text, docroot=..., allowHtml=True) below), not the stdlib
# module of the same name -- confirm.
import parser

textFiles = './text/'

ArticleWordStub = {
    'en': "This article seems to be a stub."
}
ArticleWordLarge = {
    'en': "This article seems to be too long and probably needs to be split."
}
ArticleLowIndex = {
    'en': "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy."
}
ArticleHighIndex = {
    'en': "This article seems to have too many long words and sentences for even most university graduates to easily read and understand."
}
ArticleSizeLarge = {
    'en': "Almost certainly should be divided up"
}
ArticleSizeBig = {
    'en': "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)"
}

Languages = {
    '': "English",
    'simple': "Simple English",
}
Families = {
    'wikibooks': "Wikibooks",
    'wikipedia': "Wikipedia",
    'wikisource': "Wikisource",
    'wikinews': "Wikinews",
    'wiktionary': "Wiktionary",
    'wikiquote': "Wikiquote",
    'wikiversity': "Wikiversity",
}

# How AWB does it
#   Remove tables and templates
#   Count instances using r"[a-zA-Z]+"
flags = re.UNICODE
WordCount = re.compile(r'[\w\-\']+', flags)  # should include \-
SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', flags)


def printu(ustr):
    """Print *ustr*, falling back to its UTF-8 encoding if stdout rejects it."""
    try:
        print(ustr)
    except UnicodeError:
        # Only encoding failures are expected here; anything else should
        # surface instead of being silently retried.
        print(ustr.encode('utf-8'))


def printOptions(dict, default):
    """Print one entry per key of *dict*, treating *default* specially.

    The parameter name ``dict`` shadows the builtin but is kept so keyword
    callers keep working.
    """
    for key in dict:
        if key == default:
            print('' % (key, dict[key]))  # [markup lost]
        else:
            print('' % (key, dict[key]))  # [markup lost]


def printWiki(text, basehref=None, linkClass='extiw'):
    """Run *text* through ``parser.parser`` and print the result as UTF-8.

    *basehref* is forwarded as the parser's ``docroot``.  *linkClass* is
    accepted but not used by the code visible here.
    """
    # Replace math with static image
    # [markup lost] NOTE(review): as written this pattern matches the empty
    # string; presumably it once matched a math element -- restore.
    text = re.sub(r'(?is)(.*?)', r'Math tag', text)
    text = parser.parser(text, docroot=basehref, allowHtml=True)
    # Remove Reference (no good way to parse)
    # [markup lost] NOTE(review): presumably matched reference elements.
    text = re.sub(r'(?is)]*>.*?|]*/>', '[ref]', text)
    print(text.encode('utf-8'))


def wikiLink(text):
    """Replace every ``[[target|label]]`` / ``[[target]]`` in *text*.

    The link target has spaces turned into underscores and surrounding
    underscores trimmed.  Reads the module-level ``hostname`` set by main().
    """
    for m in re.finditer(r'\[\[(?:([^][|]*)\||)(.*?)\]\]', text):
        link = (m.group(1) or m.group(2)).replace(' ', '_').strip('_')
        text = text.replace(m.group(), r'%s' % (hostname, link, m.group(2)))  # [markup lost]
    return text


def profile(text):
    """Emit a timing marker built from ``time.clock()`` and the label *text*."""
    print('' % (time.clock(), text))  # [markup lost]


#########################################
##
#########################################
def grade2age(grade):
    """Convert a U.S. grade level to a reader age (grade + 5.3, truncated)."""
    return int(grade + 5.3)


def rowPrint(*cells):
    """Print one report row: each cell is stringified and passed to wikiLink."""
    print('')
    for cell in cells:
        print('%s' % wikiLink(str(cell)))
    print('')


def average(list):
    """Return the arithmetic mean of *list* as a float, or 0 when it is empty.

    The parameter name shadows the builtin but is kept for compatibility.
    """
    if not list:
        return 0
    return float(sum(list)) / float(len(list))


class ReadingNumbers(object):
    """Word/sentence statistics and readability indexes for one text.

    Call populateStats(text) first, then makeIndexes().
    """

    def __init__(self, IgnoreWords=None):
        # A fresh list per instance: the previous mutable default was shared
        # by every instance and grew by one entry on each construction.
        #IgnoreWords = re.split(r'\W', page.title().lower())
        if IgnoreWords is None:
            ignore = []
        else:
            ignore = list(IgnoreWords)
        ignore.append(u'wikipedia')
        self.IgnoreWords = ignore

    def populateStats(self, text):
        """Count words, sentences, printable-ASCII characters and long words.

        Counts of words and sentences are floored at 1 so later divisions
        cannot fail.
        """
        self.text = text
        self.markedup = ''
        self.words = len(WordCount.findall(text)) or 1
        self.sentences = len(SentenceCount.findall(text)) or 1
        self.ascii = len(re.findall(r'(?u)[ -~]', text))

        profile('senetence counting')
        # NOTE(review): markedup is '' at this point, so these replacements
        # cannot change it; the wrapping markup in the format string also
        # appears to be stripped.
        for s in SentenceCount.finditer(text):
            matched = s.group()
            self.markedup = self.markedup.replace(
                matched, '%s%s' % (matched[:-1], matched[-1]))

        # The actual staticstics stuff, should be branched off into an object/class
        PropNouns = []  #re.findall(r'(?" % PropNouns)   <- comment truncated in this copy
        letters = 0
        unrecognized = 0
        self.HardWords = 0
        self.monoWords = 0
        syllable_list = []
        self.longwords = len(re.findall(r'[^\W\d]{7,}', text))

        profile('word analysis')
        VowelsCount = re.compile(r'(?i)[aeiouy]+')
        # NOTE(review): the per-word loop that presumably filled `letters`,
        # `syllable_list`, `HardWords` and `monoWords` is missing from this
        # copy, so those values stay at their initial zero/empty state.
        #for word in re.finditer(r'(?u)\w+', text):
        print('')
        self.letters = letters
        self.syllables = sum(syllable_list)
        self.avgSyllables = average(syllable_list)
        #print syllable_list
        print('')

    def makeIndexes(self):
        """Compute the readability indexes from the populated statistics."""
        print('')
        words = float(self.words)
        sentences = float(self.sentences)

        self.ARI = (4.71 * self.letters / words) + (0.5 * words / sentences) - 21.43
        self.Flesch_Kincaid = (0.39 * words / sentences) + (11.8 * self.avgSyllables) - 15.59
        # Flesch Reading Ease was displayed by doStats() but never assigned,
        # which raised AttributeError.  avgSyllables is syllables per word.
        self.Flesch_Ease = 206.835 - (1.015 * words / sentences) - (84.6 * self.avgSyllables)
        # NOTE(review): published Coleman-Liau scales sentences per 100 words;
        # this term looks a factor of 100 too small -- verify before changing.
        self.Cloeman_Liau = (5.89 * self.letters / words) - (0.3 * sentences / words) - 15.8
        self.Gunning_fog = 0.4 * ((words / sentences) + 100 * (float(self.HardWords) / words))
        self.SMOG = 1.0430 * sqrt(30.0 * self.HardWords / sentences) + 3.1291
        self.LIX = (self.words / sentences) + ((100 * self.longwords) / words)
        self.Linsear = 0

        # Coordinates used to place the marker on the Fry graph.
        self.fry_x = (self.avgSyllables * 100) - 108
        x = 100 * sentences / words
        self.fry_y = -0.005*x**4 + 0.3764*x**3 - 10.611*x**2 + 136.93*x - 245.68
        self.Fry = 0


def doStats(page, raw=None):
    """Print the full readability report for *page*.

    When *raw* is given it is used as the wikitext; otherwise the text is
    fetched with ``page.get()``.  Missing pages, redirects, blank pages and
    pages containing ``{{disambig}}`` produce a short message and return
    early.
    """
    if raw:
        wikiText = raw
    else:
        try:
            wikiText = page.get()
            wikiText = wikiText.replace('\r\n', '\n')
        except wikipedia.NoPage:
            print('\nPage is empty/non-existant\n')
            return
        except wikipedia.IsRedirectPage:
            print('#REDIRECT %s' % (page.hostname(), page.site().language(), page._redirarg, page._redirarg))  # [markup lost]
            return
        if not wikiText.strip():
            print('Page does not exist')
            return
        elif '{{disambig}}' in wikiText:
            printWiki(wikiText, basehref='?page=', linkClass='')
            return

    profile('contents loaded')
    printu('%s' % (page.hostname(), page.urlname(), page.title()))  # [markup lost]
    print('')
    rowPrint('', 'Paragraphs (est.)', len(re.split(r'(?um)\n\s*[\n=]', wikiText)))
    rowPrint('', 'Links', len(re.findall(r'\[\[[^][:]+\]\]', wikiText)))
    rowPrint('', 'Images', len(re.findall(r'\[\[ *[Ii]mage:', wikiText)))
    rowPrint('', 'Categories', len(re.findall(r'\[\[ *[Cc]ategory:', wikiText)))
    rowPrint('', 'Interwiki links', len(re.findall(r'\[\[ *[a-z\-]{2,4}:', wikiText)))
    rowPrint('', 'External links', len(re.findall(r'\w+://\w+\.\w+', wikiText)))
    # [markup lost] NOTE(review): counting '' returns len+1, not a tag count.
    rowPrint('', 'References', wikiText.lower().count(''))
    rowPrint('', 'Templates', wikiText.lower().count('}}'))
    print('\n')
    profile('base stats done')

    text = wikipedia.removeDisabledParts(wikiText)
    profile('removed disabled parts')

    # Remove links
    # The flags used to be passed as re.sub's fourth positional argument,
    # which is `count`, so they capped the number of substitutions and were
    # never applied as flags.  Compiling the patterns applies them for real.
    namespaceLink = re.compile(
        r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]',
        re.IGNORECASE | re.DOTALL | re.UNICODE)
    plainLink = re.compile(r'\[\[([^]|[]+\|)?([^][]+)\]\]', re.IGNORECASE | re.DOTALL)
    imageLink = re.compile(r'\[\[Image:[^][]*\]\]', re.IGNORECASE | re.DOTALL)
    # Each pass unwraps the innermost links, so nested links need several.
    while re.search(r'\[\[[^][]+\]\]', text):
        text = namespaceLink.sub('', text)
        text = plainLink.sub(r'\2', text)
        text = imageLink.sub('', text)
    profile('wikilink removal complete')

    # Remove templates
    # Tempate striper: a stack of '{{' offsets so that inner templates are
    # cut before the template that contains them.  Nesting deeper than the
    # stack limit stops the stripper.
    stack = [text.find('{{')]
    while 10 > len(stack) > 0:
        start = stack.pop()
        if start == -1:
            continue
        end = text.find('}}', start) + 2
        nextStart = text.find('{{', start + 2)
        if end > nextStart > start:
            # Another template opens before this one closes: handle it first.
            stack.append(start)
            stack.append(nextStart)
        elif end > start:
            text = text[:start] + text[end:]
            # Re-seed only after a successful removal, so an unterminated
            # '{{' ends the loop instead of being found again forever.
            if not stack:
                stack.append(text.find('{{', start))
    profile('template removal complete')

    # Remove italics and bolding
    text = re.sub(r"(?m)'''(.*?)'''", r'\1', text)
    text = re.sub(r"(?m)''(.*?)''", r'\1', text)

    # Remove External links
    text = re.sub(r'\[\w+:/*[^][<>\s"]* *(.*?)\]', r'\1', text)

    # Get "readable prose"
    prose = re.compile('|'.join((
        r'\{\|.*?\|\}',
        r'/]*>.*?',  # [markup lost]
        r'\{\{.*?\}\}',
        r'^=+[^\n]+=+ *$',
        r'^ *\*[^\n]*$',
        r'^ *\#[^\n]*$',
        r'^ [^\n]+$',
        r'^;+[^\n]*$',
        #r'^:+[^\n]{1,200}$', # can do full definition list
        r"^:+ *\w[^\n]*$",  # remove dab like lines
        r'&[^;]{2,6};')),
        re.IGNORECASE | re.DOTALL | re.MULTILINE | re.UNICODE).sub(r'', text)

    # Remove HTML
    text = re.sub(r'&#?\w{2,8};', r'', text)
    text = re.sub(r']*>', r'', text)    # [markup lost]
    prose = re.sub(r']*>', r'', prose)  # [markup lost]

    # Size Comparisions
    print('\n')
    print('\n')
    print('Wikitext: %#.3g KB' % (len(wikiText)/1024.))
    print('\n' % (len(wikiText)/350))  # [markup lost]
    print('Text: %#.3g KB' % (len(text)/1024.))
    print('\n' % (len(text)/350))  # [markup lost]
    print('Readable prose: %#.3g KB' % (len(prose)/1024.))
    print('\n' % (len(prose)/350))  # [markup lost]
    print('\n')

    textstats = ReadingNumbers()
    textstats.populateStats(text)
    textstats.makeIndexes()

    prosestats = ReadingNumbers()
    prosestats.populateStats(prose)
    prosestats.makeIndexes()

    print('')
    print('')
    rowPrint('Words:', textstats.words, prosestats.words)
    rowPrint('Sentences:', textstats.sentences, prosestats.sentences)
    rowPrint('Letters:', textstats.letters, prosestats.letters)
    rowPrint('Syllables:', textstats.syllables, prosestats.syllables)
    rowPrint('Monosyllabic words:', textstats.monoWords, prosestats.monoWords)
    rowPrint('Polysyllabic words:', textstats.HardWords, prosestats.HardWords)
    print('\nPlain textProse\n')  #'''

    print('')
    print('')
    # `or 1` keeps an empty text from dividing by zero.
    rowPrint('ASCII:', '%.2f%%' % (textstats.ascii*100/float(len(textstats.text) or 1)))
    # The prose column used to repeat the plain-text figures.
    rowPrint('Characters per word:',
             '%.2f' % (textstats.letters/float(textstats.words)),
             '%.2f' % (float(prosestats.letters)/float(prosestats.words)))
    rowPrint('Syllables per word:',
             '%.2f' % textstats.avgSyllables,
             '%.2f' % prosestats.avgSyllables)
    rowPrint('Words per sentence:',
             '%.2f' % (textstats.words/float(textstats.sentences)),
             '%.2f' % (float(prosestats.words)/float(prosestats.sentences)))
    print('\nPlain textProse\n')  #'''

    # Readability Indexes (print)
    print('')
    print('')
    rowPrint('[[Flesch Reading Ease]]', '', '', '%.1f' % prosestats.Flesch_Ease)
    rowPrint('[[Automated Readability Index]] (ARI)', '%.2f' % prosestats.ARI, grade2age(prosestats.ARI))
    rowPrint('[[Flesch-Kincaid]]', '%.2f' % prosestats.Flesch_Kincaid, grade2age(prosestats.Flesch_Kincaid))
    rowPrint('[[Coleman-Liau]]', '%.2f' % prosestats.Cloeman_Liau, grade2age(prosestats.Cloeman_Liau))
    rowPrint('[[Gunning Fog]]', '%.2f' % prosestats.Gunning_fog, grade2age(prosestats.Gunning_fog))
    rowPrint('[[SMOG]]', '%.2f' % prosestats.SMOG, grade2age(prosestats.SMOG))
    rowPrint('[[Laesbarhedsindex]] (LIX)', '', '', '%.2f' % prosestats.LIX)
    rowPrint('[[Linsear Write]]', '%.2f' % prosestats.Linsear, '')
    rowPrint('[[Fry Readability Formula|Fry]]', '%.2f' % prosestats.Fry, '')
    rowPrint('[[New Dale-Chall]]', '', '')
    rowPrint('[[Raygor Estimate]]', '', '')
    rowPrint('[[Rate Index]] (RIX)', '', '')
    rowPrint('[[]]', '', '')
    print('\nTestU.S. grade levelReader ageScale\n')

    # Fry graph marker position.
    print('\n')
    fry_left = (prosestats.fry_x*22/2+42)/3
    fry_top = (497 - prosestats.fry_y)/3
    print('' % (prosestats.fry_x, prosestats.fry_y))  # [markup lost]
    print('\n')
    print('\n')
    print('')
    print('\n' % (fry_left-4, fry_top-4, ))  # [markup lost]
    # print 'Br.ko'
    print('\n')
    print('')
    print('\n')

    # wORD tOTALS
    print('')
    print('')
    print('\n')
    print('words, McAlpine EFLAW miniwords, 6+ characters, 3+ sllables words, Spache unfamiliar words, DC unfamiliar words, Monosyllablic words')
    print('\n')

    profile('Suggestion')
    # Suggestion: the first matching rule wins.
    if textstats.words < 300:
        suggest = ArticleWordStub
    elif textstats.words > 10000:
        suggest = ArticleWordLarge
    elif textstats.SMOG <= 8:
        suggest = ArticleLowIndex
    elif textstats.SMOG >= 16:
        suggest = ArticleHighIndex
    elif len(text) > 75e3:
        suggest = ArticleSizeLarge
    elif len(text) > 50e3:
        suggest = ArticleSizeBig
    else:
        suggest = None

    # NOTE(review): the extent of this `if` body was reconstructed; confirm
    # against the upstream file.
    if suggest:
        print('\n')
        print('\nNotes\n')
        print('\n%s\n' % wikipedia.translate(page.site().language(), suggest))
        print('\n')

    print(wikiLink('\nThe follow is what the software to recognized as "[[Wikipedia:Article length#What is and is not included as .22readable prose.22|Readable Prose]]", underlined words are polysyllabic words (syllables ≥ 3) and highlighted words indicate punctuation. No direct action should result from the information provided here. Redistributed is under the terms of the [[Wikipedia:GFDL|GNU Free Documentation License]]\n'))
    profile('Analysis finished')

    print('\n')
    print('\n\nWikitext\n\n')
    printWiki(wikiText, basehref="?page=", linkClass="")
    profile('Printed wikitext')
    print('\n')

    print('\n\nText\n\n')
    printWiki(text, linkClass="")
    profile('Printed text')
    print('\n')

    print('\n\nProses (Marked up)\n\n')
    printWiki(prosestats.markedup, linkClass="")
    profile('Printed prose')
    print('\n')
    # printu('%s' % text)

    profile('per paragraph stats')
    print("""""")
    ## Laundry list ##
    # When conditions for readability is too short, turn the bar color red
    # Use javascript to select different test for readability
    print('\nPer-paragraph SMOG indexes Graph')
    numlist = []
    print('\n')
    print('\n')
    for para in prose.split('\n\n'):
        # Skip paragraphs that contain nothing but punctuation/whitespace.
        if not para.strip(' \t\r\n:.,<>[]\'"='):
            continue
        pn = ReadingNumbers()
        pn.populateStats(para)
        pn.makeIndexes()
        if pn.SMOG < 4:
            continue
        color = '#DAFF6B'
        if pn.words < 40:
            # Too few words for the test's conditions: use the other colour.
            color = "#FF6B90"
        print('\n%.2f\n' % (pn.SMOG, color, cgi.escape(para[:100]).encode('utf-8'), pn.SMOG))  # [markup lost]
        numlist.append(pn.SMOG)
    print('\n')

    avg = sum(numlist)/(len(numlist) or 1)
    print('\n\n\n' % (avg-4.5,))  # [markup lost]
    print('\n\n' % (avg-3.0,))  # [markup lost]
    print('\n\n' % (avg-1.5,))  # [markup lost]
    print('\n' % (avg, avg,))  # [markup lost]
    print('\n')
    print("""
LEGEND : Each bar corresponds to a paragraph. The red line is the mean of the bars, the boxes are the population standard distribution ranges. Thus, all the bars should lay within the boxes. Red bars indicate that the conditions for the test have not been meet.
""")
    print('\n')


def main():
    """Read the CGI form, resolve the target page and print its report.

    Sets the module-level ``debug`` and ``hostname`` globals.
    """
    global debug
    global hostname

    form = cgi.FieldStorage()
    raw = form.getfirst('raw', None)
    debug = bool(form.getfirst('debug', False))

    host = form.getfirst('hostname', 'en.wikipedia.org').split('.')
    if len(host) < 2:
        # A hostname without a dot used to raise IndexError on host[1];
        # fall back to the family of the default hostname.
        host.append('wikipedia')
    lang = form.getfirst('lang', host[0])
    family = form.getfirst('family', host[1])

    site = wikipedia.Site(lang, family)
    page = wikipedia.Page(site, form.getfirst('page', ''))
    hostname = page.hostname()

    wikipedia.startContent(u'Readability for %s' % page.title(), form=False)
    print('\n' % page.aslink()[2:-2].encode('utf-8'))  # [markup lost]
    # print ''
    # print ''
    print('\n')

    if page.title():
        doStats(page)
    elif raw:
        doStats(page, raw=raw.replace('_', ' '))
    else:
        wikipedia.showHelp('readability')


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        main()
    finally:
        wikipedia.endContent()