\n", r'')
# Horizonal line
text = text.replace("\n----", '
')
# Bold and Illatics
text = re.sub(r"(?m)'''(.*?)'''", r'\1', text)
text = re.sub(r"(?m)''(.*?)''", r'\1', text)
## space infront to (simple ver.)
#text = re.sub(r'\n ([^\n]*)', r'\n\1
', text)
#text = text.replace('
\n', '\n')
# Internal links
for l in re.finditer(r'\[\[(?!Image:)(?:([^][|]+)\||)([^][]+)\]\](\w*)', text, re.UNICODE):
link = l.group(1) or l.group(2)
title= l.group(2)
hyperlink = (link[0].upper()+link[1:]).replace(' ', '_').strip('_:')
text = text.replace(l.group(), '%s%s' % (basehref, hyperlink, link, linkClass, title, l.group(3)))
# Images
for wimg in re.finditer(r'(?s)\[\[Image:([^\[|\]]+)(\|.*?|)\]\]', text):
file = 'Image:%s' % wimg.group(1).replace('_', ' ').strip()
attribs = wimg.group(2).split('|')
size = ''
float = None
thumb = False
caption = ''
captionText = ''
for s in wimg.group(2).split('|'):
sl = s.lower().strip()
if sl.endswith('px'):
try:
size = (int(s[:-2])*64/250)
except:
pass
elif sl=='left' or sl=='right':
float = s
elif sl=='thumb' or sl=='thumbnail':
thumb = True
size = size or 64
float = float or 'right'
else:
caption = s
captionText = wikipedia.escape(re.sub(r'?\w+[^<>]*>', '', s))
if thumb:
text = text.replace(wimg.group(), """
%s
""" % (float, size+2, hostname, file.replace(' ', '_'), file, captionText, size, caption ))
else:
text = text.replace(wimg.group(), '
' % (basehref, file, hostname, file, size, float, captionText))
# External links
global BracketCount
text = re.sub(r'\[(\w+://[^][><" \n]+)\s*([^]]*?\]*)\]', r'\2', text)
while '">' in text:
BracketCount+=1
text = text.replace('">', '">[%d]' % BracketCount, 1)
text = re.sub(r'(\A|[]<>\n])(\w+://[^][><\s"]*)', r'\1 \2 ', text)
# Encode & -> &
text = text.replace('&', '&')
text = re.sub(r'\&(#\d+|#x\w+|\w{1,6});', r'&\1;', text)
# Remove Reference (no good way to parse)
text = re.sub(r'(?is)]*>.*?|[]*/>', '[ref]', text)
# Wiki markup table paser
stack = [text.find('{|')]
while 10 > len(stack) > 0:
start = stack.pop()
if start == -1:
continue
end = text.find('|}', start)+2
nextStart = text.find('{|', start+2)
if end > nextStart > start:
stack.append(start)
stack.append(nextStart)
else:
table = text[start:end]
table = re.sub(r'(?m)^\{\| *(.*)', r']', table)
table = table.replace('\n|}', '\n
')
table = re.sub(r'(?m)^\|- *(.*)', r'', table)
table = re.sub(r'(?m)^\|\+ *([^][\n|]*\||)(.*)', r'\2', table)
while re.findall(r'(?m)^([!|])(.*?)(\|\||!!)', table):
for r in re.finditer(r'(?m)^([!|])(.*?)(\|\||!!)', table):
table = table.replace(r.group(), r.group(1)+r.group(2)+'\n'+r.group(1))
table = re.sub(r'(?m)^\! *([^][<>\n|]*\||)(.*)', r'\2 | ', table)
table = re.sub(r'(?m)^\| *([^][<>\n|]*\||)(.*)', r'\2 | ', table)
# Optimize s
table = table.replace('
', '')
text = text[:start] + table + text[end:]
if stack == []:
stack.append(text.find('{|', start+2))
# New paragraphs
text = re.sub(r"\n\n+([^\n<>][^\n]*)", r'\n\n\1
', text)
return text
#########################################
##
#########################################
def grade2age(grade):
    """Convert a U.S. school grade level into an approximate reader age.

    The grade is shifted by 5.3 and the result is truncated toward zero
    by ``int``.
    """
    shifted = grade + 5.3
    return int(shifted)
def rowPrint(*cells):
# Print one output row made of the given cells, in order.
# NOTE(review): the string literals in this function look truncated (the
# opening/closing row markup appears to have been lost), so the exact text
# emitted around the cells cannot be confirmed from here.
print ''
for cell in cells:
# Each cell is converted with str() and passed through wikiLink() (defined
# elsewhere in this file) before being printed.
print('| %s | ' % wikiLink(str(cell)))
print '
'
def average(list):
    """Return the arithmetic mean of a sized collection of numbers.

    The parameter keeps its original name (which shadows the builtin
    ``list``) so that existing keyword callers continue to work.

    Returns the integer 0 for any empty collection; otherwise a float.
    The emptiness check uses ``len`` rather than a comparison with ``[]``,
    so an empty tuple (or any other empty sized collection) no longer falls
    through to a division by zero.
    """
    count = len(list)
    if count == 0:
        return 0
    return float(sum(list)) / float(count)
class ReadingNumbers(object):
    """Holds text statistics and the readability indexes derived from them."""

    def __init__(self, IgnoreWords=None):
        """Create an empty statistics holder.

        IgnoreWords -- optional iterable of words to ignore; ``u'wikipedia'``
        is always added to it.

        The previous signature used a mutable default (``[]``) and appended
        to it in place, so the shared default grew with every instance and a
        caller-supplied list was modified. The argument is now copied, and
        the resulting list is kept on ``self.IgnoreWords`` instead of being
        discarded.
        """
        #IgnoreWords = re.split(r'\W', page.title().lower())
        if IgnoreWords is None:
            ignore = []
        else:
            ignore = list(IgnoreWords)
        ignore.append(u'wikipedia')
        self.IgnoreWords = ignore
def populateStats(self, text):
# Count words, sentences, letters, syllables and "hard" (3+ vowel-group)
# words in `text`, storing the results on self, and build self.markedup,
# a copy of the text with sentence ends and hard words rewritten.
# WordCount and SentenceCount are module-level objects defined elsewhere in
# this file (used here via findall/finditer).
# NOTE(review): several string literals below look truncated (markup appears
# to have been lost), so the exact replacement text cannot be confirmed.
self.text = text
self.markedup = text
# "or 1" keeps both counts non-zero so later divisions cannot divide by zero.
self.words = len(WordCount.findall(text)) or 1
self.sentences = len(SentenceCount.findall(text)) or 1
print ''
# Rewrite every sentence match in the marked-up copy, splitting off its last
# character.
for s in SentenceCount.finditer(text):
self.markedup = self.markedup.replace(s.group(), '%s%s' % (s.group()[:-1],s.group()[-1]))
# The actual statistics stuff, should be branched off into an object/class
# NOTE(review): the next line is damaged in this view (unterminated regex
# literal); PropNouns is later used as the collection of words excluded from
# the hard-word count.
PropNouns = re.findall(r'(?" % PropNouns)
letters = 0
self.HardWords = 0
syllable_list = []
print ''
#for word in re.finditer(r'(?u)\w+', text):
for word in WordCount.finditer(text):
s = word.group()
# if s.lower() in IgnoreWords:
# continue
# Strip common suffixes before counting vowel groups. Only the first
# matching branch applies.
# NOTE(review): a word ending in 'ies' also ends in 'es', so it is handled
# by the 'es'/'ed' branch and the 'ies' test below is never reached.
if len(s) > 1 and (s.endswith('e')):
s = s[:-1]
elif len(s) > 2 and (s.endswith('es') or s.endswith('ed')):
s = s[:-2]
elif len(s) > 3 and (s.endswith('ing') or s.endswith('ies')):
s = s[:-3]
elif len(s) > 3 and (s.endswith('ely')):
# Drop only the 'e' of a trailing 'ely', keeping 'ly'.
s = s[:-3]+s[-2:]
# Each run of consecutive vowels (including y) counts as one syllable.
vowels = re.findall(r'(?i)[aeiouy]+', s)
if vowels:
letters += len(word.group()) # Make sure we're adding words
syllable_list.append(len(vowels))
elif s.isdigit():
# A purely numeric token contributes one syllable per digit.
syllable_list.append(len(s))
# Words with three or more vowel groups that are not in PropNouns count as
# hard words and are rewritten in the marked-up copy.
# NOTE(review): word.group() is interpolated into the pattern without
# re.escape — confirm WordCount can never match regex metacharacters.
if len(vowels) >= 3 and not word.group() in PropNouns and not s.isdigit():
self.HardWords += 1
self.markedup = re.sub(r'(%s)\b([^<>])' % word.group(), r'\1\2', self.markedup)
self.letters = letters
self.syllables = sum(syllable_list)
self.avgSyllables = average(syllable_list)
#print syllable_list
print ''
def makeIndexes(self):
    """Derive the readability indexes from the counts stored on self.

    Reads words, sentences, letters, HardWords and avgSyllables, and sets
    Flesch_Ease, ARI, Flesch_Kincaid, Cloeman_Liau, Gunning_fog and SMOG.
    """
    word_total = float(self.words)
    sentence_total = float(self.sentences)

    # Flesch Reading Ease
    self.Flesch_Ease = 206.835 - (1.015 * word_total / sentence_total) - (84.6 * self.avgSyllables)

    # Automated Readability Index
    self.ARI = (4.71 * self.letters / word_total) + (0.5 * word_total / sentence_total) - 21.43

    # Flesch-Kincaid grade level
    self.Flesch_Kincaid = (0.39 * word_total / sentence_total) + (11.8 * self.avgSyllables) - 15.59

    # Coleman-Liau
    # NOTE(review): the sentence term uses sentences per word, not per 100
    # words — confirm against the published formula.
    self.Cloeman_Liau = (5.89 * self.letters / word_total) - (0.3 * sentence_total / word_total) - 15.8

    # Gunning fog
    hard_word_total = float(self.HardWords)
    self.Gunning_fog = 0.4 * ((word_total / sentence_total) + 100 * (hard_word_total / word_total))

    # SMOG
    self.SMOG = 1.0430 * sqrt(30.0 * self.HardWords / sentence_total) + 3.1291
    #LIX = W/S+(100*LW)/W
def doStats(page):
# Fetch the wikitext of `page`, print a table of simple counts, strip markup
# to obtain "text" and "prose" variants, compute readability statistics for
# both with ReadingNumbers, and print the result tables plus the three
# rendered versions of the article.
# NOTE(review): many string literals in this function look truncated or split
# across lines (markup appears to have been lost), so several print
# statements carry more format arguments than placeholders. Treat the literal
# contents as unreliable in this view.
try:
wikiText = page.get()
# Normalise Windows line endings.
wikiText = wikiText.replace('\r\n', '\n')
except wikipedia.NoPage:
print 'Page is empty/non-existant
'
return
except wikipedia.IsRedirectPage:
# Redirects are reported using page._redirarg and not analysed further.
print '
%s' % (page.hostname(), page.site().language(), page._redirarg, page._redirarg)
return
if not wikiText.strip():
print 'Page does not exist'
return
elif '{{disambig}}' in wikiText:
# Disambiguation pages are only rendered, not analysed.
printWiki(wikiText, basehref='?page=%s:' % page.site().language(), linkClass = '')
return
print ''
printu('
' % (page.hostname(), page.urlname(), page.title()))
print '
'
# Count table: each row is a regex/substring count over the raw wikitext.
rowPrint('¶',
'Paragraphs (est.)', len(re.split(r'(?um)\n\s*[\n=]', wikiText)))
rowPrint('
',
'Links', len(re.findall(r'\[\[[^][:]+\]\]', wikiText)))
rowPrint('
',
'Images', len(re.findall(r'\[\[ *[Ii]mage:', wikiText)))
rowPrint('
',
'Categories', len(re.findall(r'\[\[ *[Cc]ategory:', wikiText)))
rowPrint('
',
'Interwiki links', len(re.findall(r'\[\[ *[a-z\-]{2,4}:', wikiText)))
rowPrint('
',
'External links', len(re.findall(r'\w+://\w+\.\w+', wikiText)))
# NOTE(review): the search string below is empty in this view; str.count('')
# returns len(string)+1, so the tag literal was presumably lost — verify.
rowPrint('
',
'References', wikiText.lower().count(''))
rowPrint('
',
'Templates', wikiText.lower().count('}}'))
print '
'
text = wikipedia.removeDisabledParts(wikiText)
print ''
# Remove links
# Repeat until no bracket-free [[...]] remains, so nested links are unwrapped
# from the inside out.
# NOTE(review): the re.* flag constants are passed as the fourth positional
# argument of re.sub, which is `count`, not `flags`. As written the flags are
# not applied and each call is capped at that many replacements — confirm and
# pass flags by keyword or inline (?is).
while re.search(r'\[\[[^][]+\]\]', text):
text = re.sub(r'\[\[(Image|Category|[A-Za-z]{2,3}(-[A-Za-z]+)*|simple):[^][]*\]\]', '', text, re.IGNORECASE | re.DOTALL | re.UNICODE)
text = re.sub(r'\[\[([^]|[]+\|)?([^][]+)\]\]', r'\2', text, re.IGNORECASE|re.DOTALL)
text = re.sub(r'\[\[Image:[^][]*\]\]', '', text, re.IGNORECASE|re.DOTALL)
print ''
# Remove templates
# Template stripper
# The stack holds start offsets of '{{' openings still to be closed; the loop
# stops when the stack is empty or reaches a depth of 10.
# NOTE(review): when no '}}' follows `start`, `end` becomes 1 and nothing is
# removed; the search then restarts from the same `start`, which looks like it
# can loop forever on an unclosed '{{' — verify.
stack = [text.find('{{')]
while 10 > len(stack) > 0:
start = stack.pop()
if start == -1: continue
end = text.find('}}', start)+2
nextStart = text.find('{{', start+2)
if end > nextStart > start:
# A nested template opens before this one closes: handle the inner one first.
stack.append(start)
stack.append(nextStart)
elif end > start:
text = text[:start] + text[end:]
if stack == []:
stack.append(text.find('{{', start))
print ''
# Remove italics and bolding
text = re.sub(r"(?m)'''(.*?)'''", r'
\1', text)
text = re.sub(r"(?m)''(.*?)''", r'
\1', text)
# Remove External links
text = re.sub(r'\[\w+:/*[^][<>\s"]* *(.*?)\]', r'\1', text)
# Get "readable prose"
# One alternation of patterns (tables, templates, headings, list items,
# indented lines, definition terms, entities, ...) is removed from `text`.
prose = re.compile('|'.join((
r'\{\|.*?\|\}',
r'
[/]*>.*?]',
r'\{\{.*?\}\}',
r'^=+[^\n]+=+ *$',
r'^ *\*[^\n]*$',
r'^ *\#[^\n]*$',
r'^ [^\n]+$',
r'^;+[^\n]*$',
#r'^:+[^\n]{1,200}$', # can do full definition list
r"^:+ *
\w[^\n]*$", # remove dab like lines
r'&[^;]{2,6};')), re.IGNORECASE | re.DOTALL | re.MULTILINE | re.UNICODE).sub(r'', text)
# Remove HTML
# NOTE(review): both patterns below start with '?', which is not a valid
# regex as shown; the leading part of the pattern was presumably lost.
text = re.sub(r'?\w+[^<>]*>', r'', text)
prose= re.sub(r'?\w+[^<>]*>', r'', prose)
# Compute statistics separately for the stripped text and for the prose.
textstats = ReadingNumbers()
textstats.populateStats(text)
textstats.makeIndexes()
prosestats = ReadingNumbers()
prosestats.populateStats(prose)
prosestats.makeIndexes()
print ''
print ' | Text | Proses |
'
print '| Counts |
'
rowPrint('Words', textstats.words, prosestats.words)
rowPrint('Sentences', textstats.sentences, prosestats.sentences)
rowPrint('Letters', textstats.letters, prosestats.letters)
rowPrint('Syllables', textstats.syllables, prosestats.syllables)
rowPrint('Polysyllabic words', textstats.HardWords, prosestats.HardWords)
#print '
'#'''
#
#print ''
#print ' | Plain text | Prose |
'
print '| Averages |
'
rowPrint('Characters per word','%.2f'%(float(textstats.letters)/float(textstats.words)))
rowPrint('Syllables per word', '%.2f'%textstats.avgSyllables, '%.2f'%prosestats.avgSyllables)
rowPrint('Words per sentence', '%.2f'%(textstats.words/float(textstats.sentences)), '%.2f'%(float(prosestats.words)/float(prosestats.sentences)))
print '
'#'''
# Size comparisons
# Sizes are reported in KB (length / 1024) for the wikitext, text and prose.
print '| '
print 'Wikitext: %3.3g KB' % (len(wikiText)/1024.)
print '' % (len(wikiText)/350)
print 'Text: %3.3g KB' % (len(text)/1024.)
print '' % (len(text)/350)
print 'Proses: %3.3g KB' % (len(prose)/1024.)
print '' % (len(prose)/350)
print ' |
'
# Clear
print ''
# Fry Graph
# fry_x is derived from the average syllables per word of the prose; x is the
# number of sentences per 100 words, mapped to fry_y by a quartic polynomial.
fry_x = ((prosestats.avgSyllables*100)-108)*22/2+42
x = 100*float(prosestats.sentences)/float(prosestats.words)
fry_y = -0.005*x**4 + 0.3764*x**3 - 10.611*x**2 + 136.93*x - 245.68
print ''
#print ''
#print '  '
print ' '
print '  '
print ' ' % ((fry_x-4)/1.9275, (497-fry_y-4)/1.9275, )
print '  '
print ' '
print ' |
'
# Readability Indexes (print)
# Every row uses the prose statistics; grade levels are also shown as a
# reader age via grade2age().
print ''
print 'Proses readability'
print '| Test | U.S. grade level | Reader age | Scale |
'
rowPrint('[[Flesch Reading Ease]]', '', '', '%.1f' % prosestats.Flesch_Ease)
rowPrint('[[Automated Readability Index]] (ARI)','%.2f'%prosestats.ARI, grade2age(prosestats.ARI))
rowPrint('[[Flesch-Kincaid]]', '%.2f'%prosestats.Flesch_Kincaid, grade2age(prosestats.Flesch_Kincaid))
rowPrint('[[Coleman-Liau]]', '%.2f'%prosestats.Cloeman_Liau, grade2age(prosestats.Cloeman_Liau))
rowPrint('[[Gunning Fog]]', '%.2f'%prosestats.Gunning_fog, grade2age(prosestats.Gunning_fog))
rowPrint('[[SMOG]]', '%.2f'%prosestats.SMOG, grade2age(prosestats.SMOG))
# rowPrint('[[Laesbarhedsindex]] (LIX)', '', '')
# rowPrint('[[Linsear Write]]', '', '')
# rowPrint('[[Fry Readability Formula|Fry]]','', '')
print ' | |
'
print '
'
print ' '
# Suggestion
# Pick at most one note, checked in this order: word count, SMOG of the
# stripped text (not the prose), then character length of the stripped text.
# The Article* names are module-level values defined elsewhere in this file.
if textstats.words < 300:
suggest = ArticleWordStub
elif textstats.words > 10000:
suggest = ArticleWordLarge
elif textstats.SMOG <= 8:
suggest = ArticleLowIndex
elif textstats.SMOG >= 16:
suggest = ArticleHighIndex
elif len(text) > 75e3:
suggest = ArticleSizeLarge
elif len(text) > 50e3:
suggest = ArticleSizeBig
else:
suggest = None
if suggest:
print ''
print '- Notes
'
print '- %s
' % wikipedia.translate(page.site().language(), suggest)
print '
'
print wikiLink('''
- The follow text is parsed text, plain text, and "[[Wikipedia:Article length#What is and is not included as .22readable prose.22|Readable Prose]]".
- Underlined words has 3 or more syllables.
- Highlighed words indicate puncuation.
- Redistributed under the terms of the [[Project:GFDL|GNU Free Documentation License]]
- Information provided here is not accurate ;-)
''')
# Finally render the raw wikitext, the stripped text and the marked-up prose.
print ''
print 'Wikitext'
printWiki(wikiText, basehref="?page=%s:"%page.site().language(), linkClass="")
print ' | '
print 'Text'
printWiki(text, linkClass="")
print ' | '
print 'Proses (Marked up)'
printWiki(prosestats.markedup, linkClass="")
print ' |
'
# printu('%s' % text)
def main():
    """CGI entry point: read the request parameters and report on one page.

    Sets the module-level globals ``debug`` and ``hostname``, writes the page
    header through ``wikipedia.startContent`` and, when the requested page
    has a title, hands it to ``doStats``.
    """
    global debug, hostname

    form = cgi.FieldStorage()
    debug = bool(form.getfirst('debug', False))

    # The configured default site supplies the fallback language and family.
    default_site = wikipedia.getSite()
    lang = form.getfirst('lang', default_site.lang)
    family = form.getfirst('family', default_site.family)

    target_site = wikipedia.Site(lang, family)
    requested_title = form.getfirst('page', '')
    page = wikipedia.Page(target_site, requested_title)

    hostname = page.hostname()
    wikipedia.startContent(u'Readability for %s' % page.title(), form=False)
    print('')
    if page.title():
        doStats(page)
# Script entry point: run main() only when executed directly and when
# wikipedia.handleUrlAndHeader() returns a true value.
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
try:
main()
finally:
# Cleanup calls run even if main() raises.
# NOTE(review): indentation is lost in this view, so whether stopme() sits
# inside the finally block or after it cannot be confirmed.
wikipedia.endContent()
wikipedia.stopme()