\n', text)
# definition lists
text = re.sub(r"\n;([^:\n]*)\n?", r'\n
\1
\n', text)
text = re.sub(r"\n:([^\n]*)\n?", r'\n
\1
', text)
# bullet and numbered lists
text = re.sub(r"\n ?\* *([^\n]*)", r'
\n\t
\1
\n
', text)
text = re.sub(r"\n ?\# *([^\n]*)", r'\n\t
\1
\n', text)
# Merge list together
text = text.replace("
\n", r'')
text = text.replace("\n", r'')
# Horizontal line
text = text.replace("\n----", '')
# Bold and italics
text = re.sub(r"'''([^\n]*?)'''", r'\1', text)
text = re.sub(r"''([^\n]*?)''", r'\1', text)
# space infront to
(simple ver.)
# text = re.sub(r'\n ([^\n]*)', r'\n
\1
', text)
# Internal links
text = re.compile(r'\[\[([^][]*?)\|([^][]*)\]\](\w*)', re.UNICODE).sub(r'\2\3' % hostname, text)
text = re.compile(r'\[\[([^][|]*?)\]\](\w*)', re.UNICODE).sub(r'\1\2' % hostname, text)
'''
# Simple Wikitable support
text = re.sub(r'\n\{\|([^\n]*)', r'\n
', text)
text = re.sub(r'\n\|-([^\n]*)', r'\n
', text)
text = re.sub(r'\n\|\}', r'\n
', text)
text = re.sub(r'\n!([^][\n|]*\|)?([^\n]*)', r'\n
\2
', text)
text = re.sub(r'\n\|([^][\n|]*\|)?([^\n]*)', r'\n
\2
', text) #'''
# External links
#text = re.sub(r'\[(\w+://[^][><" \n]+)\s*([^][]*?]*)\]', r'\2', text)
text = re.sub(r'\[(\w+://[^][><" \n]+)\s*([^]]*?]*)\]', r'\2', text)
global count
while(text.find(r'">') != -1):
count += 1
text = re.sub(r'">', r'">[%s]' % count, text, 1)
text = re.sub(r'(\A|[]<>\n])(\w+://[^][><\s"]*)', r'\1 \2 ', text)
# Encode & -> &amp;
text = text.replace('&', '&')
text = re.sub(r'\&(#\d+|#x\w+|\w{1,6});', r'&\1;', text)
# New paragraphs
text = re.sub(r"\n\n+([^\n<>][^\n]*)", r'\n\n
' % (page._redirarg, page._redirarg)
return
if not wikiText.strip():
print 'Page does not exist'
return
#text = wikipedia.removeDisabledParts(wikiText)
text = re.compile(r'.*?||
.*?
|.*?|', re.IGNORECASE | re.DOTALL).sub('', wikiText)
print ''
# Remove links
while re.search(r'\[\[[^][]+\]\]', text):
text = re.sub(r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]', '', text, re.IGNORECASE | re.DOTALL | re.UNICODE)
text = re.sub(r'\[\[([^]|[]+\|)?([^][]+)\]\]', r'\2', text, re.IGNORECASE|re.DOTALL)
text = re.sub(r'\[\[Image:[^][]*\]\]', '', text, re.IGNORECASE|re.DOTALL)
print ''
# Remove templates
#while '{{' in text and '}}' in text:
while re.findall(r'\{\{[^{}]*?\}\}', text, re.DOTALL):
text = re.compile(r'\{\{[^{}]*?\}\} *', re.DOTALL).sub(r'', text)
print ''
# Remove italics and bolding
text = re.sub(r"''(.*?)''|'''(.*?)'''", r'\1', text)
# Remove External links
text = re.sub(r'\[\w+:/*[^][<>\s"]* *(.*?)\]', r'\1', text)
# Get "readable prose"
prose = re.compile('|'.join((
r'\{\|.*?\|\}',
r'/]*>.*?',
r'\{\{.*?\}\}',
r'^=+[^\n]+=+ *$',
r'^ *\*[^\n]*$',
r'^ *\#[^\n]*$',
r'^;+[^\n]*$',
#r'^:+[^\n]{1,200}$', # can do full definition list
r"^:+ *''\w[^\n]*$", # remove dab like lines
r'&[^;]{2,6};')), re.IGNORECASE | re.DOTALL | re.MULTILINE | re.UNICODE).sub(r'', text)
# Remove HTML
text = re.sub(r'?\w+[^<>]*>', r'', text)
prose= re.sub(r'?\w+[^<>]*>', r'', prose)
#print ''
WordCounter = re.compile(r'\b[^\W\d]+\b', re.UNICODE)
sentenceFinder = re.compile(r'\w*\w\w[ \'"\])]*[\.\!\?]+[\s\'"\]()]', re.UNICODE)
#sentenceFinder = re.compile(r'\.\s') # Tests other test
# lettersProse = len( re.findall(r'[^\W\d]', prose) )
wordsWiki = len( WordCounter.findall(wikiText)) or 1
wordsText = len( WordCounter.findall(text )) or 1
wordsProse = len( WordCounter.findall(prose )) or 1
sentencesWiki = len( sentenceFinder.findall(wikiText)) or 1
sentencesText = len( sentenceFinder.findall(text )) or 1
sentencesProse = len( sentenceFinder.findall(prose )) or 1
report = prose
# highlight end of sentences
for s in sentenceFinder.findall(prose):
report = report.replace(s, '%s%s' % (s[:-1],s[-1]))
print ''
# The actual statistics stuff, should be branched off into an object/class
IgnoreWords = re.split(r'\W', page.title().lower())
IgnoreWords.append(u'wikipedia')
PropNouns = re.findall(r'(?" % PropNouns)
print ''
# Calculate stats
Flesch_Ease = 206.835 - (1.015 * float(wordsProse) / float(sentencesProse)) - (84.6 * average(syllable_list))
AR_Index = ( 4.71 * float(lettersProse) / float(wordsProse)) + ( 0.5 * float(wordsProse) / float(sentencesProse) ) - 21.43
Kincaid_level = ( 0.39 * float(wordsProse) / float(sentencesProse) ) + ( 11.8 * average(syllable_list) ) - 15.59
Cloeman_Liau = ( 5.89 * float(lettersProse) / float(wordsProse) ) - ( 0.3 * float(sentencesProse) / float(wordsProse) ) - 15.8
Gunning_index = 0.4 * ( (float(wordsProse) / float(sentencesProse)) + 100 * (float(HardWords) / float(wordsProse)) )
import math
SMOG_Index = 1.0430 * math.sqrt(30.0 * float(HardWords) / float(sentencesProse) ) + 3.1291
# TODO: add per paragraph readability scores
print '
Notes:'
if wordsProse < 300:
print "This article seems to be a stub."
elif wordsProse > 10000:
print "This article seems to be too long and probably needs to be split."
elif SMOG_Index <= 8:
print "Longer words and more compound or complex sentences may make this article more interesting, more precise and less wordy."
elif SMOG_Index >= 16:
print "This article seems to have too many long words and sentences for even most university graduates to easily read and understand."
elif len(text) > 75e3:
print "Almost certainly should be divided up"
elif len(text) > 50e3:
print "Probably should be divided (although the scope of a topic can sometimes justify the added reading time)"
else:
print '-'
print '
'
print '
The follow is what the software to recognized as "Readable Prose", underlined words are polysyllabic words (syllables ≥ 3) and highlighed words indicate puncuation. No direct action should result from the information provided here.
'
print ''
printu(wikiToHTML(report))
# printu('
%r
' % report)
# Maps wiki language codes to their human-readable language names.
Languages = {
    'en': "English",
    'simple': "Simple English",
}
# Maps wiki project family identifiers to their display names.
Families = {
    'wikibooks': "Wikibooks",
    'wikipedia': "Wikipedia",
    'wikisource': "Wikisource",
    'wikinews': "Wikinews",
    'wiktionary': "Wiktionary",
    'wikiquote': "Wikiquote",
    'wikiversity': "Wikiversity",
}
def printOptions(dict, default):
    """Print one HTML <option> element per entry of a value -> label mapping.

    dict    -- mapping whose keys become the option values and whose values
               become the visible labels (the parameter name shadows the
               builtin; it is kept so existing callers are unaffected).
    default -- key whose option is emitted pre-selected.

    Entries are written to stdout in the mapping's own iteration order.
    """
    # The format strings used to be empty, so applying `% (key, dict[key])`
    # raised "TypeError: not all arguments converted" on the first entry.
    # NOTE(review): the <option> markup is restored from the function's
    # purpose -- confirm it matches the form template this feeds.
    # print(...) with a single parenthesised argument behaves the same as the
    # Python 2 print statement used elsewhere in this file.
    for key in dict:
        if key == default:
            print('<option value="%s" selected="selected">%s</option>' % (key, dict[key]))
        else:
            print('<option value="%s">%s</option>' % (key, dict[key]))
#
# Here are the functions that are stubbed to reduce complexity
#
# CGI entry point: reads the request parameters, builds the wiki Page object,
# prints the page header and hands the page to doStats() when a title was given.
def main():
    # hostname is assigned below as a module-level global; threshold is
    # declared global here but never assigned in this function.
    global threshold, hostname
    form = cgi.FieldStorage()
    # The hostname (default 'en.wikipedia.org') is split on '.' so that its
    # first two parts can act as fallbacks for 'lang' and 'family'.
    # NOTE(review): host[1] raises IndexError when the supplied hostname
    # contains no dot -- confirm whether that input can reach this script.
    host = form.getfirst('hostname', 'en.wikipedia.org').split('.')
    lang = form.getfirst('lang', host[0])
    family = form.getfirst('family', host[1])
    site = wikipedia.Site(lang, family)
    page = wikipedia.Page(site, form.getfirst('page', ''))
    hostname = page.hostname()
    #wikipedia.startContent()
    HTMLheader(u'Readability for %s' % page.title())
    print ''
    # NOTE(review): this string has no conversion specifier, so applying
    # `% page.urlname()` raises TypeError if urlname() returns a string. The
    # link markup this message was formatted into appears to be missing --
    # restore it before relying on this line.
    print 'New version in the works check it out' % page.urlname()
    # Only compute statistics when a page title was actually supplied.
    if page.title():
        doStats(page)
# Script entry point: main() runs only when the module is executed directly
# and wikipedia.handleUrlAndHeader() returns a true value.
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        main()
    finally:
        # Runs whether or not main() raised.
        # NOTE(review): the original indentation is lost; stopme() is assumed
        # to belong inside this finally block alongside endContent() -- confirm.
        wikipedia.endContent()
        wikipedia.stopme()