\n", r'')
# Horizontal line
text = text.replace("\n----", '
')
# Bold and italics
text = re.sub(r"(?m)'''(.*?)'''", r'\1', text)
text = re.sub(r"(?m)''(.*?)''", r'\1', text)
## space infront to (simple ver.)
#text = re.sub(r'\n ([^\n]*)', r'\n\1
', text)
#text = text.replace('
\n', '\n')
# Internal links
for l in re.finditer(r'\[\[(?!Image:)(?:([^][|]+)\||)([^][]+)\]\](\w*)', text, re.UNICODE):
link = l.group(1) or l.group(2)
title= l.group(2)
hyperlink = (link[0].upper()+link[1:]).replace(' ', '_').strip('_:')
text = text.replace(l.group(), '%s%s' % (basehref, hyperlink, link, linkClass, title, l.group(3)))
# Images
for wimg in re.finditer(r'(?s)\[\[Image:([^\[|\]]+)(\|.*?|)\]\]', text):
file = 'Image:%s' % wimg.group(1).replace('_', ' ').strip()
attribs = wimg.group(2).split('|')
size = ''
float = None
thumb = False
caption = ''
captionText = ''
for s in wimg.group(2).split('|'):
sl = s.lower()
if sl.endswith('px'):
try:
size = int(s[:-2])
except:
pass
elif sl=='left' or sl=='right':
float = s
elif sl=='thumb' or sl=='thumbnail':
thumb = True
size = size or 64
float = float or 'right'
else:
caption = s
captionText = wikipedia.escape(re.sub(r'?\w+[^<>]*>', '', s))
if thumb:
text = text.replace(wimg.group(), """
%s
""" % (float, size+2, hostname, file.replace(' ', '_'), file, captionText, size, caption ))
else:
text = text.replace(wimg.group(), '
' % (basehref, file, hostname, file, size, float, captionText))
# External links
global BracketCount
text = re.sub(r'\[(\w+://[^][><" \n]+)\s*([^]]*?\]*)\]', r'\2', text)
while '">' in text:
BracketCount+=1
text = text.replace('">', '">[%d]' % BracketCount, 1)
text = re.sub(r'(\A|[]<>\n])(\w+://[^][><\s"]*)', r'\1 \2 ', text)
# Encode & -> &
text = text.replace('&', '&')
text = re.sub(r'\&(#\d+|#x\w+|\w{1,6});', r'&\1;', text)
# Remove Reference (no good way to parse)
text = re.sub(r'(?is)]*>.*?|[]*/>', '[ref]', text)
# Wiki markup table parser
stack = [text.find('{|')]
while 10 > len(stack) > 0:
start = stack.pop()
if start == -1:
continue
end = text.find('|}', start)+2
nextStart = text.find('{|', start+2)
if end > nextStart > start:
stack.append(start)
stack.append(nextStart)
else:
table = text[start:end]
table = re.sub(r'(?m)^\{\| *(.*)', r']', table)
table = table.replace('\n|}', '\n
')
table = re.sub(r'(?m)^\|- *(.*)', r'', table)
table = re.sub(r'(?m)^\|\+ *([^][\n|]*\||)(.*)', r'\2', table)
while re.findall(r'(?m)^([!|])(.*?)(\|\||!!)', table):
for r in re.finditer(r'(?m)^([!|])(.*?)(\|\||!!)', table):
table = table.replace(r.group(), r.group(1)+r.group(2)+'\n'+r.group(1))
table = re.sub(r'(?m)^\! *([^][<>\n|]*\||)(.*)', r'\2 | ', table)
table = re.sub(r'(?m)^\| *([^][<>\n|]*\||)(.*)', r'\2 | ', table)
# Optimize s
table = table.replace('
', '')
text = text[:start] + table + text[end:]
if stack == []:
stack.append(text.find('{|', start+2))
# New paragraphs
text = re.sub(r"\n\n+([^\n<>][^\n]*)", r'\n\n\1
', text)
return text
#########################################
##
#########################################
def grade2age(grade):
    """Convert a grade-level score into a whole-number reader age.

    Adds a fixed offset of 5.3 to ``grade`` and truncates the sum to an
    ``int`` (truncation is toward zero, as done by ``int()``).

    grade -- numeric grade level (int or float).
    Returns an int.
    """
    return int(grade + 5.3)
# Print one row of output: an opening print, then one '| %s | ' cell for each
# positional argument (each converted with str() and passed through
# wikiLink()), then a closing print.
# NOTE(review): the opening literal is empty and the closing literal is split
# across two lines, which suggests markup was lost when this file was
# extracted -- confirm against the original file before relying on the output.
def rowPrint(*cells):
print ''
for cell in cells:
print('| %s | ' % wikiLink(str(cell)))
print '
'
def average(list):
    """Return the arithmetic mean of the numbers in ``list``.

    list -- any iterable of numbers (list, tuple, iterator, ...). The
            parameter keeps its historical name for backward compatibility
            even though it shadows the builtin inside this function.

    Returns 0 for an empty input, otherwise the mean as a float.
    """
    # Materialise once so that iterators work and so that emptiness can be
    # tested for every sequence type, not only for a literal [].
    values = tuple(list)
    if not values:
        return 0
    return float(sum(values)) / float(len(values))
# Holds the compiled word/sentence patterns and the statistics derived from a
# text (word count, sentence count, letter count, readability indexes).
# NOTE(review): indentation was lost in this copy of the file and at least one
# statement below is truncated, so the class does not parse as shown.
class ReadingNumbers(object):
def __init__(self):
# How AWB does it
# Remove tables and templates
# Count instances using r"[a-zA-Z]+"
flags = re.UNICODE
# A word is a run of word characters that are not digits.
self.WordCount = re.compile(r'[^\W\d]+', flags)
# A sentence end is: the tail of a word (at least two word characters),
# optional space/quote/closing-bracket characters, one or more of . ! ?,
# then one whitespace, quote, bracket or parenthesis character.
self.SentenceCount = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', flags)
#IgnoreWords = re.split(r'\W', page.title().lower())
#IgnoreWords.append(u'wikipedia')
# Count words, sentences, letters and vowel groups in `text` and store the
# results on the instance.
def populateStats(self, text):
self.markedup = text
# `or 1` keeps both counts non-zero; they are used as divisors in makeIndexes.
self.words = len(self.WordCount.findall(text)) or 1
self.sentences = len(self.SentenceCount.findall(text)) or 1
print ''
# NOTE(review): `report` is read here before any assignment visible in this
# method, and `s` is a match object from finditer() yet is sliced like a
# string -- looks like a bug or extraction loss; confirm.
for s in self.SentenceCount.finditer(text):
report = report.replace(s.group(), '%s%s' % (s[:-1],s[-1]))
# The actual statistics stuff, should be branched off into an object/class
# NOTE(review): the next statement is truncated (unterminated pattern); the
# original proper-noun regex cannot be recovered from this copy.
PropNouns = re.findall(r'(?" % PropNouns)
letters = 0
HardWords = 0
syllable_list = []
print ''
# NOTE(review): `word` is a match object from finditer(), but it is used below
# with len()/endswith()/slicing as if it were a string -- confirm intent.
for word in self.WordCount.finditer(text):
s = word
# if s.lower() in IgnoreWords:
# continue
# Strip common suffixes before counting vowel groups.
if len(s) > 1 and (s.endswith('e')):
s = s[:-1]
elif len(s) > 2 and (s.endswith('es') or s.endswith('ed')):
s = s[:-2]
elif len(s) > 3 and (s.endswith('ing') or s.endswith('ies')):
s = s[:-3]
elif len(s) > 3 and (s.endswith('ely')):
s = s[:-3]+s[-2:]
# Each run of consecutive vowels (including y) counts as one syllable.
vowels = re.findall(r'(?i)[aeiouy]+', s)
if vowels:
letters += len(word) # Make sure we're adding words
syllable_list.append(len(vowels))
# Words with three or more vowel groups that are not in PropNouns are "hard".
if len(vowels) >= 3 and not word in PropNouns:
HardWords += 1
report = re.sub(r'(%s)\b([^<>])' % word, r'\1\2', report)
self.letters = letters
#print syllable_list
if debug:
print ''
print ( 0.39 * float(self.words) / float(self.sentences) )
print ( 11.8 * average(syllable_list) )
print 'syllable list length: %d' % len(syllable_list)
print 'sum of syllable:%d' % sum(syllable_list)
print '
'
# Compute the readability indexes and store them as attributes on self.
# NOTE(review): this def takes no `self` parameter yet assigns to self.* and
# reads syllable_list / HardWords, which are locals of populateStats. With the
# indentation lost it is unclear whether it was nested inside populateStats --
# confirm against the original file.
def makeIndexes():
# Calculate stats
self.Flesch_Ease = 206.835 - (1.015 * self.words/ float(self.sentences)) - (84.6 * average(syllable_list))
self.ARI = ( 4.71 * self.letters/ float(self.words)) + ( 0.5 * float(self.words) / float(self.sentences) ) - 21.43
self.Flesch_Kincaid = ( 0.39 * self.words / float(self.sentences) ) + ( 11.8 * average(syllable_list) ) - 15.59
self.Cloeman_Liau = ( 5.89 * self.letters/ float(self.words) ) - ( 0.3 * float(self.sentences) / float(self.words) ) - 15.8
self.Gunning_fog = 0.4 * ( (float(self.words) / float(self.sentences)) + 100 * (float(HardWords) / float(self.words)) )
self.SMOG = 1.0430 * sqrt(30.0 * HardWords / float(self.sentences) ) + 3.1291
#LIX = W/S+(100*LW)/W
# Print a readability report for `page`: markup counts (links, images,
# categories, ...), a size comparison of wikitext / plain text / prose,
# word and sentence statistics, readability indexes, an optional suggestion,
# and finally the wikitext, plain text and marked-up prose themselves.
# Returns early (after printing a message) when the page is missing, is a
# redirect, is blank, or contains '{{disambig}}'.
# NOTE(review): many string literals below are empty or split across lines,
# which suggests markup was lost when this file was extracted -- confirm
# against the original file.
def doStats(page):
# Fetch the wikitext and normalise CRLF line endings to LF.
try:
wikiText = page.get()
wikiText = wikiText.replace('\r\n', '\n')
except wikipedia.NoPage:
print 'Page is empty/non-existant
'
return
except wikipedia.IsRedirectPage:
print '
%s' % (page.hostname(), page.site().language(), page._redirarg, page._redirarg)
return
# Blank pages and disambiguation pages get no statistics.
if not wikiText.strip():
print 'Page does not exist'
return
elif '{{disambig}}' in wikiText:
printWiki(wikiText, basehref='?page=', linkClass = '')
return
print """
"""
printu('%s' % (page.hostname(), page.urlname(), page.title()))
printu('' % (page.site().language(),page.urlname()))
print ''
# Markup counts, each taken with a regex or substring count over the raw wikitext.
rowPrint('
',
'Links', len(re.findall(r'\[\[[^][:]+\]\]', wikiText)))
rowPrint('
',
'Images', len(re.findall(r'\[\[ *[Ii]mage:', wikiText)))
rowPrint('
',
'Categories', len(re.findall(r'\[\[ *[Cc]ategory:', wikiText)))
rowPrint('
',
'Interwiki links', len(re.findall(r'\[\[ *[a-z\-]{2,4}:', wikiText)))
rowPrint('
',
'External links', len(re.findall(r'\w+://\w+\.\w+', wikiText)))
rowPrint('
',
'References', wikiText.lower().count(''))
rowPrint('
',
'Templates', wikiText.lower().count('}}'))
print '
'
# From here on `text` is progressively stripped of markup.
text = wikipedia.removeDisabledParts(wikiText)
print ''
# Remove links
# NOTE(review): re.sub's fourth positional parameter is `count`, not `flags`,
# so the flag constants below are being passed as a replacement limit rather
# than as regex flags -- confirm and pass them as flags= if that was intended.
while re.search(r'\[\[[^][]+\]\]', text):
text = re.sub(r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]', '', text, re.IGNORECASE | re.DOTALL | re.UNICODE)
text = re.sub(r'\[\[([^]|[]+\|)?([^][]+)\]\]', r'\2', text, re.IGNORECASE|re.DOTALL)
text = re.sub(r'\[\[Image:[^][]*\]\]', '', text, re.IGNORECASE|re.DOTALL)
print ''
# Remove templates
# Repeats until no innermost {{...}} remains, so nested templates are removed
# from the inside out.
#while '{{' in text and '}}' in text:
while re.findall(r'\{\{[^{}]*?\}\}', text, re.DOTALL):
text = re.compile(r'\{\{[^{}]*?\}\}', re.DOTALL).sub(r'', text)
print ''
# Remove italics and bolding
text = re.sub(r"(?m)'''(.*?)'''", r'\1', text)
text = re.sub(r"(?m)''(.*?)''", r'\1', text)
# Remove External links
text = re.sub(r'\[\w+:/*[^][<>\s"]* *(.*?)\]', r'\1', text)
# Get "readable prose"
# `prose` is `text` with tables, templates, headings, list items,
# space-indented lines, definition-list lines and character entities removed.
prose = re.compile('|'.join((
r'\{\|.*?\|\}',
r'[/]*>.*?]',
r'\{\{.*?\}\}',
r'^=+[^\n]+=+ *$',
r'^ *\*[^\n]*$',
r'^ *\#[^\n]*$',
r'^ [^\n]+$',
r'^;+[^\n]*$',
#r'^:+[^\n]{1,200}$', # can do full definition list
r"^:+ *\w[^\n]*$", # remove dab like lines
r'&[^;]{2,6};')), re.IGNORECASE | re.DOTALL | re.MULTILINE | re.UNICODE).sub(r'', text)
# Remove HTML
text = re.sub(r'?\w+[^<>]*>', r'', text)
prose= re.sub(r'?\w+[^<>]*>', r'', prose)
# Size comparisons
print '| '
print ''
print 'Wikitext: %#.3g KB' % (len(wikiText)/1024.)
print '' % (len(wikiText)/350)
print 'Plain text: %#.3g KB' % (len(text)/1024.)
print '' % (len(text)/350)
print 'Readabible proses: %#.3g KB' % (len(prose)/1024.)
print '' % (len(prose)/350)
print ' |
'
# Word and sentence counts for both the plain text and the prose; `or 1`
# keeps every count non-zero because they are used as divisors below.
WordCounter = re.compile(r'\b[^\W\d]+\b', re.UNICODE)
sentenceFinder = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', re.UNICODE)
wordsText = len( WordCounter.findall(text )) or 1
wordsProse = len( WordCounter.findall(prose )) or 1
sentencesText = len( sentenceFinder.findall(text )) or 1
sentencesProse = len( sentenceFinder.findall(prose )) or 1
report = prose
# Highlight end of sentences
for s in sentenceFinder.findall(prose):
report = report.replace(s, '%s%s' % (s[:-1],s[-1]))
print ''
# The actual statistics stuff, should be branched off into an object/class
IgnoreWords = re.split(r'\W', page.title().lower())
IgnoreWords.append(u'wikipedia')
# NOTE(review): the next statement is truncated (unterminated pattern), and
# lettersProse, HardWords and syllable_list, which are used below, are never
# assigned in the visible code -- the per-word counting loop appears to have
# been lost in extraction; confirm against the original file.
PropNouns = re.findall(r'(?" % PropNouns)
print ''
# Calculate stats
Flesch_Ease = 206.835 - (1.015 * wordsProse / float(sentencesProse)) - (84.6 * average(syllable_list))
ARI = ( 4.71 * lettersProse / float(wordsProse)) + ( 0.5 * float(wordsProse) / float(sentencesProse) ) - 21.43
Flesch_Kincaid = ( 0.39 * wordsProse / float(sentencesProse) ) + ( 11.8 * average(syllable_list) ) - 15.59
Cloeman_Liau = ( 5.89 * lettersProse / float(wordsProse) ) - ( 0.3 * float(sentencesProse) / float(wordsProse) ) - 15.8
Gunning_fog = 0.4 * ( (float(wordsProse) / float(sentencesProse)) + 100 * (float(HardWords) / float(wordsProse)) )
SMOG = 1.0430 * sqrt(30.0 * HardWords / float(sentencesProse) ) + 3.1291
#LIX = W/S+(100*LW)/W
print ''
print ' | Plain text | Prose |
'
rowPrint('Words', wordsText, wordsProse)
rowPrint('Sentences',sentencesText,sentencesProse)
rowPrint('Letters', '', lettersProse)
rowPrint('Polysyllabic words', '', HardWords)
rowPrint('Characters per word','', '%.2f' % (float(lettersProse)/float(wordsProse)))
rowPrint('Syllables per word', '', '%.2f' % average(syllable_list))
rowPrint('Words per sentence', '', '%.2f' % (float(wordsProse)/float(sentencesProse)))
print '
'
# Readability Indexes (print)
print ''
print '| Test | U.S. grade level | Reader age | Scale |
'
rowPrint('[[Flesch Reading Ease]]', '', '', '%.1f' % Flesch_Ease)
rowPrint('[[Automated Readability Index]]','%.2f'%ARI, grade2age(ARI))
rowPrint('[[Flesch-Kincaid Grade Level]]', '%.2f'%Flesch_Kincaid, grade2age(Flesch_Kincaid))
rowPrint('[[Coleman-Liau Index]]', '%.2f'%Cloeman_Liau, grade2age(Cloeman_Liau))
rowPrint('[[Gunning fog index]]', '%.2f'%Gunning_fog, grade2age(Gunning_fog))
rowPrint('[[SMOG Index]]', '%.2f'%SMOG, grade2age(SMOG))
rowPrint('[[Laesbarhedsindex]] (LIX)', '', '')
rowPrint('[[Linsear Write]]', '', '')
print '
'
print ''
# Suggestion
# Only the first matching condition applies: word count is checked first,
# then the SMOG score, then the plain-text length.
if wordsProse < 300:
suggest = ArticleWordStub
elif wordsProse > 10000:
suggest = ArticleWordLarge
elif SMOG <= 8:
suggest = ArticleLowIndex
elif SMOG >= 16:
suggest = ArticleHighIndex
elif len(text) > 75e3:
suggest = ArticleSizeLarge
elif len(text) > 50e3:
suggest = ArticleSizeBig
else:
suggest = None
if suggest:
print ''
print '- Notes
'
print '- %s
' % wikipedia.translate(page.site().language(), suggest)
print '
'
printWiki('The follow is what the software to recognized as "[[Wikipedia:Article length#What is and is not included as .22readable prose.22|Readable Prose]]", underlined words are polysyllabic words (syllables ≥ 3) and highlighed words indicate puncuation. No direct action should result from the information provided here. Redistrubuted under the [[Wikipedia:Text of the GNU Free Documentation License|GNU Free Documentation License]]
')
print ''
print 'Wikitext'
printWiki(wikiText, basehref="?page=", linkClass="")
print ' | '
print 'Text'
printWiki(text, linkClass="")
print ' | '
print 'Proses (Marked up)'
printWiki(report, linkClass="")
print ' |
'
def main():
form = cgi.FieldStorage()
global debug
debug = bool(form.getfirst('debug', False))
host = form.getfirst('hostname', 'en.wikipedia.org').split('.')
lang = form.getfirst('lang', host[0])
family = form.getfirst('family', host[1])
site = wikipedia.Site(lang, family)
page = wikipedia.Page(site, form.getfirst('page', ''))
global hostname
hostname= page.hostname()
wikipedia.startContent(u'Readability for %s' % page.title(), form=False)
print ''
if page.title():
doStats(page)
# Script entry point: main() runs only when the file is executed directly and
# wikipedia.handleUrlAndHeader() returns a true value. endContent() is called
# in the finally clause so it runs even if main() raises.
# NOTE(review): indentation was lost in this copy, so it is unclear whether
# stopme() also belongs to the finally clause -- confirm against the original.
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
try:
main()
finally:
wikipedia.endContent()
wikipedia.stopme()