#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
webchecklinks.py
A web wrapper around the checklinks library

(c) Dispenser, 2007-2008
'''
# NOTE(review): this copy of the script appears to have lost the HTML markup
# that used to live inside its string literals: several format strings now
# have fewer %s placeholders than arguments, and some literals are reduced to
# bare newlines.  Those literals are kept exactly as found and are each marked
# "markup lost" below.  Every '%' expression so marked raises TypeError when
# reached, so the templates must be restored from a good copy before this
# script can run.  This is Python 2 code (str/unicode handling); the
# single-argument print(...) form is used because it behaves the same there.
import cgi
import sys

import checklink
import wikipedia

# Don't log, since errors generally happen in a thread and don't go through
# the normal handler
import cgitb; cgitb.enable()


def printu(ustr):
    '''Print ustr; if that fails with a Unicode error, print it UTF-8 encoded.'''
    try:
        print(ustr)
    except UnicodeError:
        # Only encoding problems warrant the retry; anything else should
        # surface instead of being hidden by a catch-all handler.
        print(ustr.encode('utf-8'))


def printFile(name, arg1="$1"):
    '''Print ./text/<name>.html with every "$1" replaced by arg1 (UTF-8 encoded).

    With the default arg1 the placeholder is replaced by itself, i.e. the
    file is printed unchanged.
    '''
    f = open('./text/%s.html' % name)
    try:
        template = f.read()
    finally:
        # Release the handle even when the read fails.
        f.close()
    print(template.replace('$1', arg1.encode('utf-8')))


def printEntry(self, page, url, refId, context, status, reason, redirect, rank, comment):
    '''Print one checked link and flush it to the client.

    Passed as the callback to checklink.checkMWhtml / checkMediaWikiPage in
    main().  The self and page arguments are accepted but not used here.
    '''
    #printu(u'%s %s - %s' % (status, reason, wikilink))
    classes = 'dead-%s' % rank

    # The request list is scanned line by line for this exact URL.
    requests = open('/home/dispenser/webcite_requests.txt', 'r')
    try:
        if url.encode('utf-8') + '\n' in requests:
            classes += ' webcite'
    finally:
        requests.close()

    if redirect:
        classes += ' redirect'

    # A comment (or, failing that, the reason) is shown only for ranked entries.
    if rank:
        note = comment or reason or ''
    else:
        note = ''

    # NOTE(review): markup lost -- 4 placeholders for 6 arguments, and the
    # replace('[[', '[[') below is a no-op as written; restore both.
    printu('%s%s%s%s' % (classes, refId or '', context.replace('[[', '[['), reason, status, note))
    # flush to user
    sys.stdout.flush()


def textbox(name, value, label = None, attrib=''):
    '''Print a text input for the form field name.

    label defaults to the capitalized field name followed by ": ".
    '''
    if label is None:
        label = '%s: ' % name.capitalize()
    # NOTE(review): markup lost -- 0 placeholders for 6 arguments.
    print('' % (name, label, name, value, name, attrib))


def checkbox(name, checked, label = None, attr=''):
    '''Print a checkbox for the form field name, ticked when checked is true.

    label defaults to the capitalized field name.
    '''
    if checked:
        attr += ' checked="checked"'
    # NOTE(review): markup lost -- 0 placeholders for 5 arguments.
    print('' % (name, name, attr, name, label or name.capitalize()))


def main():
    '''Read the CGI parameters, configure checklink and print the report.

    Boolean options are true for any non-empty submitted value (bool() of the
    raw string).  Numeric options are passed through int(), so a non-numeric
    value raises ValueError.
    '''
    #global hostname
    form = cgi.FieldStorage()
    htmlmode = bool(form.getfirst('html', False))
    checklink.DEBUG = bool(form.getfirst('debug', False))
    checklink.SOURCE = bool(form.getfirst('source', False))
    checklink.config.max_external_links = int(form.getfirst('threads', 30))
    checklink.config.defaulttimeout = int(form.getfirst('timeout', 30))
    checklink.config.httpDebug = int(form.getfirst('httpDebug', 0))
    checklink.config.useGET = bool(form.getfirst('alwaysUseGet', False))
    checklink.config.threaded = not bool(form.getfirst('nothread', False))

    requestedHost = form.getfirst('hostname')
    if requestedHost:
        # The first two dot-separated labels select the site; a hostname
        # without a dot raises IndexError here.
        host = requestedHost.split('.')
        site = wikipedia.Site(host[0], host[1])
    else:
        site = wikipedia.getSite()
    page = wikipedia.Page(site, form.getfirst('page', ''))
    #hostname = page.hostname()

    # NOTE(review): head is an empty string here -- presumably markup lost.
    wikipedia.startContent(u'Checklinks: %s' % page.title(), form=False, head='''''')
    # NOTE(review): markup lost -- only a newline remains of this literal.
    print('\n')
    # NOTE(review): markup lost -- 0 placeholders for 1 argument.
    print('' % page.aslink()[2:-2].encode('utf-8'))
    if checklink.DEBUG:
        # NOTE(review): the original indentation was lost; it is assumed that
        # all option controls below belong to the debug branch -- confirm.
        print('\n')  # NOTE(review): markup lost
        textbox('httpDebug', checklink.config.httpDebug, 'HTTP debug level: ', ' size="3"')
        textbox('threads', checklink.config.max_external_links, attrib=' size="2"')
        textbox('timeout', checklink.config.defaulttimeout, attrib=' size="3"')
        print('\n')  # NOTE(review): markup lost
        checkbox('alwaysUseGet', checklink.config.useGET, 'Always download')
        checkbox('debug', checklink.DEBUG, 'Debug')
        checkbox('nothread', not checklink.config.threaded, 'Disable threading')
        checkbox('html', htmlmode, 'HTML input')
        checkbox('source', checklink.SOURCE, 'Print source')
    # NOTE(review): assumed to sit outside the debug branch -- confirm.
    print('\n')  # NOTE(review): markup lost

    try:
        page.get()
    except wikipedia.NoPage:
        # sys.exc_info()[1] is the exception instance being handled.
        printu('NoPage error encountered (%s)' % sys.exc_info()[1])
        return
    except wikipedia.IsRedirectPage:
        link = wikipedia.Page(page.site(), page._redirarg).aslink()[2:-2]
        # NOTE(review): markup lost -- 1 placeholder for 3 arguments.
        printu('#REDIRECT %s' % (page.hostname(), link, link))
        return

    printFile('checklinks-header')
    # Compute the age once so the test and the printed timestamp agree.
    age = checklink.cacheAge(page)
    if age < float('inf'):
        # NOTE(review): markup lost around the sentence below.
        print(checklink.time.strftime('\n\nPreviously cached on %d %B %Y at %H:%M\n\n', checklink.time.gmtime(checklink.time.time() - age)))
    else:
        print("")
    # NOTE(review): assumed to follow the if/else rather than belong to the
    # else branch -- confirm.  Markup lost.
    print('')
    #import parser
    #printu('' % (parser.escapeId(page.title().encode('utf-8')), page.hostname(), page.urlname(), page.aslink()[2:-2], page.title()))
    # NOTE(review): markup lost -- 0 placeholders for 5 arguments.
    printu('' % ('', page.hostname(), page.urlname(), page.aslink()[2:-2], page.title()))
    printFile('checklinks-tableHead')
    try:
        if htmlmode:
            checklink.checkMWhtml(page, printEntry)
        else:
            checklink.checkMediaWikiPage(page, printEntry)
    finally:
        # Printed even when the check fails part-way through.
        # NOTE(review): markup lost -- the %s markers are printed literally.
        print('\n%s\n%s\n')


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        main()
    finally:
        wikipedia.endContent()