#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Linkcheck.py
Web interface for displaying and managing pages related to checklinks.py
'''
# NOTE(review): this copy of the file looks like it went through an
# HTML-to-text conversion: the markup that the string literals presumably
# contained is missing, several literals now span physical lines, and many
# '%' format strings no longer contain the conversion specifiers their
# argument tuples need.  The literals are kept exactly as found; restore them
# from the original source before running this script.
import cgi, os, re, time
import parser#, wikipedia
# HTML debugging
import cgitb; cgitb.enable(logdir='./logs/')
# Python on Solaris handle SIGPIPE different than it does under bash linux
import signal; signal.signal(signal.SIGPIPE,signal.SIG_DFL)

root = "/~dispenser";
docroot= "/home/~dispenser/public_html/cgi-bin/text";

def print_tsnotice():
    '''Print the contents of /var/www/sitenotice if the file is non-empty.

    A missing or unreadable file (IOError) is silently ignored.
    '''
    try:
        # NOTE(review): the file object is never closed explicitly.
        notice = open('/var/www/sitenotice', 'r').read()
        if notice:
            print '
%s
'%notice
    except IOError:
        pass

def timeago(sec):
    '''Return `sec` (a number of seconds) as a short human-readable duration.

    The largest unit whose threshold has been reached is used, from seconds
    up to "months" of 2419200 seconds (28 days); the value is formatted with
    two significant digits ('%.2g').
    '''
    if sec < 60: return '%.2g seconds' % (sec)
    elif sec < 3600: return '%.2g minutes' % (sec/60.)
    elif sec < 86400: return '%.2g hours' % (sec/3600.)
    elif sec < 604800: return '%.2g days' % (sec/86400.)
    elif sec < 2419200: return '%.2g weeks' % (sec/604800.)
    else: return '%.2g months' % (sec/2419200.)
    # NOTE(review): unreachable - every branch above already returns.
    return

def printWikiFile(name):
    '''Render ./text/<name>.html through parser.parser() and print the result.

    Any '/' in `name` is replaced by '|' to build the file name.  If the file
    cannot be opened, a "does not exist" message is printed instead.
    '''
    try:
        f = open('./text/%s.html' % name.replace('/', '|'))
        print parser.parser(f.read(), allowComments=True, allowHtml=True, sanitize=False)
        # NOTE(review): f is not closed if parser.parser() or read() raises.
        f.close()
    except IOError:
        print '

%s.html does not exist.

' % name

def printFile(name, arg1="$1", arg2="$2"):
    '''Print ./text/<name>.html with '$1' and '$2' replaced by arg1 and arg2.

    `name` is inserted into the path unmodified.  With the default arguments
    the placeholders are replaced by themselves, i.e. left untouched.
    '''
    # never allow user input for name!
    f = open('./text/%s.html' % name)
    print f.read().replace('$1', arg1).replace('$2', arg2)
    f.close()

def startContent(title='Link checker'):
    '''Print the page head, the title, the view tabs and the site notices.

    Reads the module-level globals `urlname` and `viewtabs`, which main()
    sets before calling this function.
    '''
    printFile('head', title, ('/' in urlname and '' or '') + '')
    print ''
    print '

%s

' % title
    print '
'
    print '
'
    print """
Views
    """
    # A tab tuple is (url, label) or (url, label, True); a third element
    # marks the tab as selected.
    for tab in viewtabs:
        print """%s""" % (len(tab)>2 and ' class="selected"' or '', tab[0], tab[1], )
    print """\
"""
    print """
"""
    print_tsnotice()
    printFile('notice')
    print '
'
    # The bare string below is disabled code, not output.
    ''' print """
""" # printWikiFile('siteNotice') print """

""" '''

def endContent():
    '''Print the text that closes the content area opened by startContent().'''
    print """
"""

def wikiParser(text):
    '''Return `text` rendered by parser.parser() with its default options.'''
    return parser.parser(text)

def printHelpLinks():
    '''Print the help-links snippet (the literal is a single space in this copy).'''
    print ''' '''

def printReportSummary(filename):
    '''Print a per-page table and overall statistics for a report file.

    Reads ./reports/<filename> ('/' replaced by '|'); for each line the second
    tab-separated field is taken as the page name, and the matching file under
    ./cache/ is scanned to count links per rank (field 8 of each cache line).
    Pages with at least one rank '4' or '5' link get a table row.  Also sets
    the global `notes` to a "last modified" message.  If the report file is
    missing, the 'doesnotexist' wiki page is printed instead.
    '''
    try:
        modifiedtime = os.path.getmtime('./reports/'+filename.replace('/', '|'))
        global notes
        #notes = ('Updated %s ago on %s (UTC)' % (timeago(time.time() - modifiedtime), time.ctime(modifiedtime), ),) #time.strftime('%A, %Y %d, %B at %H:%M (UTC)', time.gmtime())
        notes = ('This page was last modified on %s' % time.ctime(modifiedtime),)
    except OSError:
        printWikiFile('doesnotexist')
        return

    pagecount = 0
    # Link counts keyed by rank, accumulated over all pages.
    rankstats = {'0':0,'1':0,'2':0,'3':0,'4':0,'5':0,'6':0}
    barelinks = {}
    counter = 0
    print ''
    print ''
    # NOTE(review): neither the report file nor the cache files opened in this
    # loop are closed explicitly.
    for page in open('./reports/%s' % filename.replace('/', '|')):
        pagecount += 1
        pagename = page.strip('\n').split('\t')[1]
        # Link counts keyed by rank for the current page only.
        laststats = {'0':0,'1':0,'2':0,'3':0,'4':0,'5':0,'6':0}
        try:
            # The cache file name is the report line with tabs -> ':',
            # spaces -> '_' and '/' -> '|'.
            for linkentry in open('./cache/'+page.replace('\t', ':').replace(' ', '_').replace('/', '|').strip('\n')):
                items = linkentry.split('\t')
                # print '', items, ''
                rankstats[items[8]] = rankstats.get(items[8], 0) + 1
                laststats[items[8]] = laststats.get(items[8], 0) + 1
                if '"external autonumber"' in linkentry:
                    barelinks[pagename] = barelinks.get(pagename, 0) + 1
        except IOError:
            print ''%pagename
            continue

        if laststats['4'] or laststats['5']:
            counter+= 1
            print ''
            print wikiParser('' % (pagename.replace('|', '/'), ))
            print ''%(pagename,'/~dispenser/cgi-bin/webchecklinks.py/', pagename)
            for num in (laststats['4'], laststats['5']):
                print num and ''%num or ''
            print ''
    print '' % (counter, pagecount)
    print '
ArticleToolsSuspiciousDead
WARNING: %s is not cached
[[%s]]view, refresh% 3d
Showing %d out of %d pages
'
    # Blank file, divide by zero errors
    if pagecount == 0:
        return
    linkcount = sum(rankstats.values())
    # NOTE(review): linkcount can still be 0 here (e.g. every page missing
    # from the cache), in which case the divisions below raise
    # ZeroDivisionError - confirm and guard.
    # The argument order is ranks 0, 1, 2, 4, 5, 3, then 6, matching the row
    # labels Good, Status, Warn, Suspicious, Error, Conection, Badlinks.
    print """
Overview
LinksAvg per pagePercent total
Good: %6d %5.3g %5.1f%%
Status: %6d %5.3g %5.1f%%
Warn: %6d %5.3g %5.1f%%
Suspicious:%6d %5.3g %5.1f%%
Error: %6d %5.3g %5.1f%%
Conection: %6d %5.3g %5.1f%%
Badlinks: %6d -
Total: %6d %#.3g %2.2f%%
""" % ( ','.join(["%d"%(rankstats[c]*100/linkcount+0.5) for c in '012345']), 3.14159-(3.14159*rankstats['0']/linkcount), rankstats['0'], (rankstats['0']/float(pagecount)), (rankstats['0']*100/float(linkcount)), rankstats['1'], (rankstats['1']/float(pagecount)), (rankstats['1']*100/float(linkcount)), rankstats['2'], (rankstats['2']/float(pagecount)), (rankstats['2']*100/float(linkcount)), rankstats['4'], (rankstats['4']/float(pagecount)), (rankstats['4']*100/float(linkcount)), rankstats['5'], (rankstats['5']/float(pagecount)), (rankstats['5']*100/float(linkcount)), rankstats['3'], (rankstats['3']/float(pagecount)), (rankstats['3']*100/float(linkcount)), rankstats['6'], linkcount, (linkcount/float(pagecount)), 100, )
    if barelinks:
        print parser.parser("

These pages appear to have bare links: "+', '.join(sorted(["[[%s]] (%d)"%(k,v) for k,v in barelinks.items()]))+"

")

def printLinksSummary(filename):
    '''Print a summary of ./jobs/<filename>.tsv ('/' replaced by '|').

    Lists the pages that have links of rank 4 or 5, then per-rank totals and
    the pages that appear to contain bare links.  If the job file does not
    exist, the 'doesnotexist' wiki page is printed instead.
    '''
    if not os.path.exists('./jobs/%s.tsv' % filename.replace('/', '|')):
        printWikiFile('doesnotexist')
        return
    # Maps a page (column 0) to [count of rank 4, count of rank 5].
    broken = {}
    # NOTE(review): onthefly is assigned but not used in this function.
    onthefly = '/~dispenser/cgi-bin/webchecklinks.py?page='
    # Index 9 of the os.stat() result is read as the file's modification time.
    Modify_time = os.stat('./jobs/%s.tsv' % filename.replace('/', '|'))[9]
    print '
Updated %s ago on %s (UTC)
' % (timeago(time.time() - Modify_time), time.ctime(Modify_time), ) #time.strftime('%A, %Y %d, %B at %H:%M (UTC)', time.gmtime())
    f = open('./jobs/%s.tsv' % filename.replace('/', '|'))
    lastpage = ''
    pagecount = 0
    # Link counts indexed by integer rank 0..6.
    rankstats = [0, 0, 0, 0, 0, 0, 0]
#   codestats = {}
    barelinks = {}
    # NOTE(review): f is not closed if an exception is raised inside the loop.
    for line in f:
        # NOTE(review): because this test comes first, '#' comment lines with
        # fewer than 7 tabs are skipped before the elif below can print them.
        if not line or line.count('\t') < 7:# for whatever reason we always \n as the last item 6:
            continue
        elif line.startswith('#'): # Print file comments
            # Comments are only echoed before the first data row.
            if not lastpage:
                print wikiParser(line[1:])
            continue
        # Old format: (pagename, wikilink, status, reason, content_length, content_type, rank, comments)
        # New format: (pagename, wikilink, status, reason, metadata, Ref No., rank, comment
        v = line.split('\t')
        if 6 > int(v[6]) >= 4:
            if not broken.has_key(v[0]):
                broken[v[0]] = [0, 0]
            broken[v[0]][int(v[6])-4] += 1
        # Rows are assumed to be grouped by page; a change in column 0 counts
        # as a new page.
        if v[0] != lastpage:
            lastpage = v[0]
            pagecount += 1
        # Gather statistics
        rankstats[int(v[6])] += 1
#       if v[2] not in codestats:codestats[v[2]] = 0
#       codestats[v[2]] += 1
        if v[4] == '{}' and (v[1].startswith('http://') or ' ' not in v[1] and v[5]):
            if v[0] in barelinks:
                barelinks[v[0]] += 1
            else:
                barelinks[v[0]] = 1
    f.close()
    print ''
    print ''
    counter = 0
    for key in broken.iterkeys():
        # Takes the text between the first two characters and the first ']';
        # presumably the key looks like '[[Page name]]' - verify.
        pagename = key[2:key.find(']')]
        anchorname = parser.escapeId(pagename)
        print wikiParser('' % (key, filename, anchorname, ))
        for num in broken[key]:
            print num and ''%num or ''
        print '' % (pagename, pagename, pagename)
        counter += 1
    print '' % (counter, pagecount)
    print '
ArticleLinks tabSuspiciousDeadTools
%sView% 3d Checklinks (cache), Dablinks
Show %d out of %d pages
'
    # Disabled response-code chart, kept commented out:
#   print ''
#   colormap = {'-':'ccccff','1':'ABCEDF', '2':'CCFCCC', '3':'CCE5F2', '4':'FC9999', '5':'E5CCB3'}
#   if '200' in codestats:
#       total = float(sum(codestats.values()))
#       print 'The right graph excludes "200 OK" responses, thus representing only %d%% of all responses.' % (100.5-100*codestats['200']/total)
#       print '' % (3.14159-codestats['200']*3.14159/total, codestats['200']*100/total, 100-(codestats['200']*100/total), codestats['200']*100/total+0.5,)
#       del codestats['200']
#
#   # group values below 1% into "Other"
#   total = sum(codestats.values())
#   for (key, value) in codestats.items():
#       if value < total / 100 and not key[0]=='4':
#           codestats['Other'] = value + codestats.get('Other', 0)
#           del codestats[key]
#   print """%r"""%(
#       codestats,
#       '|'.join([colormap.get(s[0], 'aaaaaa') for s in sorted(codestats.keys())]),
#       ','.join(["%s"%(codestats[s]*100/total) for s in sorted(codestats.keys())]),
#       '|'.join(sorted(codestats.keys())),
#   )
#   print ''
    # The argument order is ranks 0, 1, 2, 4, 5, 3, then 6, matching the row
    # labels Good, Status, Warn, Suspicious, Error, Conection, Badlinks.
    if sum(rankstats) > 0:print """
LinksAvg per pagePercent total
Good: %6d %5.3g %5.1f%%
Status: %6d %5.3g %5.1f%%
Warn: %6d %5.3g %5.1f%%
Suspicious:%6d %5.3g %5.1f%%
Error: %6d %5.3g %5.1f%%
Conection: %6d %5.3g %5.1f%%
Badlinks: %6d -
Total: %6d %#.3g %2.2f%%
""" % ( ','.join(["%d"%(i*100/sum(rankstats)+0.5) for i in rankstats]), 3.14159-(3.14159*rankstats[0]/sum(rankstats)), rankstats[0], (rankstats[0]/float(pagecount)), (rankstats[0]*100/float(sum(rankstats))), rankstats[1], (rankstats[1]/float(pagecount)), (rankstats[1]*100/float(sum(rankstats))), rankstats[2], (rankstats[2]/float(pagecount)), (rankstats[2]*100/float(sum(rankstats))), rankstats[4], (rankstats[4]/float(pagecount)), (rankstats[4]*100/float(sum(rankstats))), rankstats[5], (rankstats[5]/float(pagecount)), (rankstats[5]*100/float(sum(rankstats))), rankstats[3], (rankstats[3]/float(pagecount)), (rankstats[3]*100/float(sum(rankstats))), rankstats[6], sum(rankstats), (sum(rankstats)/float(pagecount)), 100, )
    if barelinks:
        print parser.parser("These pages appear to have bare links: "+', '.join(sorted(["%s (%d)"%(k,v) for k,v in barelinks.items()])))

def printLinksPage(filename, threshold='1', code=None):
    '''Print the link table for ./jobs/<filename>.tsv ('/' replaced by '|').

    Rows whose rank (column 6) compares below `threshold` are skipped, as are
    rows whose status (column 2) differs from `code` when `code` is given.
    If the job file does not exist, the 'doesnotexist' wiki page is printed.
    '''
    if not os.path.exists('./jobs/%s.tsv' % filename.replace('/', '|')):
        printWikiFile('doesnotexist')
        return
    printFile('tableHeader')
    lastpage = ''
    f = open('./jobs/%s.tsv' % filename.replace('/', '|'))
    try:
        for line in f:
            if line.startswith('#'):
                continue
            #(pagename, wikilink, status, reason, content_length, content_type, rank, comments) = line.split('\t')
            v = line.split('\t')
            # NOTE(review): v[6] is a string; when threshold is also a string
            # (the default, and what main() passes) this is a lexicographic
            # comparison, not a numeric one.
            if not len(v) > 6 or v[6] < threshold:
                continue
            if code and v[2] != code:
                continue
            # Emit a heading each time the page column changes.
            if v[0] != lastpage:
                lastpage = v[0]
                anchor = parser.escapeId(lastpage[2:lastpage.find(']')])
                print wikiParser('%s' % (anchor, lastpage))
            # truncate response reason if too long
            reason = v[3]
            if len(reason) >= 37: # "Temporary failure in name resolution" is the longest at 36 chars
                reason = reason[:25] + '...'
            # NOTE(review): eval() executes whatever expression is stored in
            # column 4 of the job file, and the bare except hides every
            # failure.  If the column is always a dict literal,
            # ast.literal_eval would be a safer parser - confirm before
            # changing.
            try: # Not very safe!
                meta = ('
'.join(['='.join(tuple) for tuple in eval(v[4]).items()])).encode('utf-8')
            except:
                meta = ''
            # NOTE(review): v[7] raises IndexError for rows with exactly
            # seven fields; the length check above only guarantees v[6].
            print parser.parser('\t–%s%s%s%s%s' % (v[6], '', meta.replace('[[', '[['), v[1], reason, v[2], v[7] or reason))
            #print ('\t%s%s%s%s' % (v[6], v[2], reason, v[1], v[7]))
    finally:
        print ''
        f.close()

def main():
    '''Handle one CGI request: dispatch on the 'action' field and print the page.

    Form fields read: 'action' (default 'view'), 'threshold' (default '1'),
    'title' (default 'Main_Page') and, for the links view, 'code'.
    Sets the globals `urlname`, `viewtabs` and `notes`, which startContent()
    and the footer use.
    '''
    form = cgi.FieldStorage()
    action = form.getfirst('action', 'view')
    threshold = form.getfirst('threshold', '1')
    global urlname # used to set base
    urlname = form.getfirst('title', 'Main_Page')
    title = urlname.replace('_', ' ')
    filename = urlname.replace('/', '|') #+ '.tsv'
    # Previously we used '-' to represent '/', now we use '|'
    #filename = filename.replace('/', '-')
    global viewtabs
    viewtabs = [("#", "Special page", True)]
    global notes
    notes = ()

    # Wikitext files
    if action == 'view':
        viewtabs = [
            ("/~dispenser/view/%s" % urlname, "Page", True),
            ("/~dispenser/source/%s"%urlname, "View source"),
        ]
        startContent(title)
        printWikiFile(urlname)
        try:
            modifiedtime = os.path.getmtime('./text/%s.html'%filename)
            notes = ('This page was last modified on %s' % time.ctime(modifiedtime),)
        except OSError:
            pass
    elif action == 'source' or action == 'edit':
        viewtabs = [
            ("/~dispenser/view/%s" % urlname, "Page"),
            ("/~dispenser/source/%s"%urlname, "View source", True),
        ]
        startContent('View source: %s' % title)
        try:
            # NOTE(review): in this copy f is opened but never read or closed;
            # the statement that printed its contents was presumably lost
            # together with the markup.
            f = open('./text/%s.html'%filename)
            modifiedtime = os.stat('./text/%s.html' % filename)[9]
            notes = ('Updated %s ago on %s (UTC)' % (timeago(time.time() - modifiedtime), time.ctime(modifiedtime), ),) #time.strftime('%A, %Y %d, %B at %H:%M (UTC)', time.gmtime())
            print '
You can view and copy the source of this page:
'
            print ''
        except IOError:
            print '

%s.html does not exist.

' % urlname
    # Link File Tables
    elif action == 'links' or action == 'table':
        viewtabs = [
            ("/~dispenser/summary/%s" % urlname, "Summary"),
            ("/~dispenser/links/%s" % urlname, "Links", True),
#           ("/~dispenser/job/%stsv" % filename, "Download (.tsv)"),
        ]
        startContent("Link results from %s" % title)
        printHelpLinks()
        printLinksPage(urlname, threshold, form.getfirst('code'))
    elif action == 'cache':
        startContent("Cache of %s" % title)
        requestname = filename.replace('/', '|')
        # Names without a ':' get the "wikipedia:en:" prefix.
        if not ':' in requestname:
            requestname="wikipedia:en:"+requestname
        try:
            # NOTE(review): f is never closed explicitly.
            f = open('./cache/%s' % requestname)
            print """ """
            printFile('checklinks-cache-warning')
            print 'refresh' % (requestname.replace('wikipedia:', 'w:'))
            print ''
            print '' % (requestname.replace('wikipedia:', 'w:'), requestname.replace('wikipedia:en:', ''), requestname)
            #print '' % (parser.escapeId(page.title().encode('utf-8')), page.hostname(), page.urlname(), page.aslink()[2:-2], page.title())
            printFile('tableHeader')
            for line in f:
                cells = line.split('\t')
                print ''%cells[8]
                print ""%cells[1]
                print ""%cells[2]
                print ''%(cells[4],cells[3])
                print ""%cells[9]
                print ''
            print '
%s
%s
%s%s%s%s
'
        except IOError:
            print 'No cached result exist of %s' % (requestname, requestname)
            #raise
        # NOTE(review): assigned after startContent() has already printed the
        # tabs, so this value is not used for the tab bar of this request.
        viewtabs = [
            ("/~dispenser/cache/%s" % requestname, "Cache", True),
        ]
    elif action == 'report' or action=='summary':
        viewtabs = [
            ("/~dispenser/summary/%s" % urlname, "Summary", True),
        ]
        startContent("Summary of %s" % title)
        printHelpLinks()
        printReportSummary(urlname)
    elif action == 'oldsummary':
        startContent("Summary of %s" % title)
        printHelpLinks()
        printLinksSummary(urlname)
        # NOTE(review): as in the 'cache' branch, this is assigned after
        # startContent() has already printed the tabs.
        viewtabs = [
            ("/~dispenser/summary/%s" % urlname, "Summary", True),
            ("/~dispenser/links/%s" % urlname, "Links"),
        ]
    # Queue list
    elif action == 'list':
        viewtabs = [
            ("/~dispenser/list/%s" % urlname, "Jobs", True),
        ]
        startContent('%s' % title)
        # NOTE(review): unlike the other branches, the user-supplied title is
        # used in the path without replacing '/', so a value containing '../'
        # may reach files outside ./jobs/ - verify and sanitise.  The file is
        # also never closed, and a missing file raises an uncaught IOError.
        jobs = file('./jobs/%s'% urlname.lower())
        if ' ' in title:
            print wikiParser("''See also: [[Category:%s]]''"%title)
        print ''
        for s in jobs:
            if not s.strip(): continue
            cmd = s.strip('\n\r').split('\t')
            # Iterates over a slice copy, so removing from cmd is safe here.
            for item in cmd[1:]:
                if item.startswith('-list:jobs/'):
                    cmd.remove(item)
            # Pad to at least four fields.
            while len(cmd) <= 3:cmd.append('')
            # Append the report's modification time (UTC) when the report exists.
            try:cmd.append(time.strftime("%d %B %Y at %H:%M", time.gmtime(os.path.getmtime("./reports/%s"%cmd[0]))))
            except OSError:pass
            print '' % (cmd[0].replace('|', '/'), cmd[1], '
%s%s
'.join(cmd[2:]))
        print '
'
    else:
        startContent()
        print 'No action by that name specified'

    endContent()
    printFile('nav')
    # The footer gets the notes plus the time.clock() reading for this process.
    printFile('footer', '
'.join(notes + ("Page rendering in %#3.2f seconds" % time.clock(),)))
    print ''

def handleUrlAndHeader():
    '''Print the HTTP headers; return True if the page body should follow.

    The REQUEST_URI environment variable is normalised by turning '%20' and
    '+' into '_' and stripping trailing '_'.  If that changes the URI, a 301
    redirect to the normalised URI is printed and False is returned;
    otherwise the Content-Type header is printed and True is returned.
    '''
    redirect = os.getenv("REQUEST_URI", '')
    redirect = redirect.replace('%20', '_').replace('+', '_')
    redirect = redirect.rstrip('_')
    if redirect != os.getenv("REQUEST_URI", ''):
        # http://turbo-technical-report.blogspot.com/2006/11/server-side-301-302-http-response.html
        # Saved me quite a bit of trouble.  Thanks!
        print "Status: 301"
        print 'Location: ' + redirect
        print 'Content-Type: text/html; charset=utf-8'
        print
        print """ 301 Moved Permanently

Moved Permanently

The document has moved here.

""" % redirect
        return False
    else:
        #print 'Content-Type: application/xhtml+xml; charset=utf-8'
        print 'Content-Type: text/html; charset=utf-8'
        print
        return True

# Only render the page when run as a script and no redirect was issued; the
# finally clause prints the closing text even if main() raises.
if __name__ == "__main__" and handleUrlAndHeader():
    try:
        print ''
        print ''
        main()
    finally:
        print ''