#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
webchecklinks.py
A web wrapper around the checklinks library

(c) Dispenser, 2007-2008
'''
# NOTE(review): in the copy of this file that was reviewed, several string
# literals carried fewer %-placeholders than arguments (a guaranteed
# TypeError), which suggests their HTML markup had been stripped.  The markup
# in those literals below is reconstructed from the argument order only;
# every such literal is tagged NOTE(review) -- confirm against the deployed
# templates.  Literals that take no %-arguments are kept exactly as found.
import cgi, re
import checklink, wikipedia, parser
import cgitb; cgitb.enable(logdir='./logs/', context=10)

# Newline-separated list of URLs; a match adds the "webcite" class to a row.
WEBCITE_REQUESTS = '/home/dispenser/webcite_requests.txt'

# Entries whose rank is below this value are not printed.  main() overwrites
# it from the "threshold" form field; the default only keeps addEntry() from
# raising NameError if it is called before main().
threshold = 0


def printu(ustr):
    """Print *ustr*; on a Unicode error print its UTF-8 encoding instead."""
    try:
        print(ustr)
    except UnicodeError:
        # Only encoding problems warrant the fallback; any other failure
        # (e.g. an I/O error on stdout) is allowed to propagate.
        print(ustr.encode('utf-8'))


def printFile(name, arg1="$1"):
    """Print ./text/<name>.html with every '$1' replaced by *arg1*.

    *arg1* is UTF-8 encoded before substitution.
    """
    f = open('./text/%s.html' % name)
    try:
        template = f.read()
    finally:
        # Close the handle even when read() fails.
        f.close()
    print(template.replace('$1', arg1.encode('utf-8')))


def _webciteRequested(url):
    """Return True if *url* (UTF-8 encoded) is a full line of WEBCITE_REQUESTS."""
    wanted = url.encode('utf-8') + '\n'
    f = open(WEBCITE_REQUESTS, 'r')
    try:
        for line in f:
            if line == wanted:
                return True
        return False
    finally:
        f.close()


def addEntry(self, page, url, refId, context, status, reason, redirect, rank, comment):
    """Print one result row for a checked link.

    Passed as the callback to checklink.checkMWhtml() and
    checklink.checkMediaWikiPage().  Nothing is printed when *rank* is below
    the module-level ``threshold``.  The row's class list is 'dead-<rank>',
    plus ' webcite' when *url* is in the WebCite request list and
    ' redirect' when *redirect* is true.  *self* and *page* are unused here.
    """
    if rank < threshold:
        return
    classes = 'dead-%s' % rank
    if _webciteRequested(url):
        classes += ' webcite'
    if redirect:
        classes += ' redirect'
    # The last cell is empty for rank 0, otherwise the comment (or reason).
    note = (comment or reason or '') if rank else ''
    # NOTE(review): as found, this replace() is a no-op; the replacement text
    # was presumably lost together with the markup -- confirm.
    shown = context.replace('[[', '[[')
    # NOTE(review): markup reconstructed.  The literal had 4 placeholders for
    # 6 arguments; `classes` and `reason` are assumed to have been attributes.
    printu('<tr class="%s"><td>%s</td><td>%s</td><td title="%s">%s</td><td>%s</td></tr>' % (
        classes, refId or '', shown, reason, status, note))


def textbox(name, value, label = None, attrib=''):
    """Print a labelled text input called *name* holding *value*.

    *label* defaults to the capitalised name followed by ': '.  *attrib* is
    appended verbatim inside the input tag.
    """
    if label is None:
        label = '%s: ' % name.capitalize()
    # NOTE(review): markup reconstructed from the argument order.
    print('<label for="%s">%s</label><input type="text" name="%s" value="%s" id="%s"%s />' % (
        name, label, name, value, name, attrib))


def checkbox(name, checked, label = None, attr=''):
    """Print a labelled checkbox called *name*, ticked when *checked* is true.

    *label* defaults to the capitalised name.  *attr* is appended verbatim
    inside the input tag.
    """
    if checked:
        attr += ' checked="checked"'
    # NOTE(review): markup reconstructed from the argument order.
    print('<input type="checkbox" name="%s" id="%s"%s /><label for="%s">%s</label>' % (
        name, name, attr, name, label or name.capitalize()))


def _intField(form, name, default):
    """Return form field *name* as an int, or *default* if absent or non-numeric."""
    try:
        return int(form.getfirst(name, default))
    except (TypeError, ValueError):
        return default


def _esc(text):
    """HTML-escape *text*, including double quotes, for use in markup."""
    return cgi.escape(text, True)


def main():
    """Handle one request: read the form, print the options, check the page.

    Form fields read: debug, source, threads, timeout, httpDebug,
    alwaysUseGet, html, nothread, threshold, hostname, page.
    """
    global threshold
    form = cgi.FieldStorage()
    checklink.DEBUG = bool(form.getfirst('debug', False))
    checklink.SOURCE = bool(form.getfirst('source', False))
    # Numeric fields fall back to their defaults on malformed input instead
    # of aborting the request with a ValueError.
    checklink.config.max_external_links = _intField(form, 'threads', 30)
    checklink.config.defaulttimeout = _intField(form, 'timeout', 30)
    checklink.config.httpDebug = _intField(form, 'httpDebug', 0)
    alwaysUseGet = bool(form.getfirst('alwaysUseGet', False))
    htmlmode = bool(form.getfirst('html', False))
    nothread = bool(form.getfirst('nothread', False))
    threshold = _intField(form, 'threshold', 0)

    if form.getfirst('hostname'):
        # The first two dot-separated labels are handed to wikipedia.Site().
        host = form.getfirst('hostname').split('.')
        site = wikipedia.Site(host[0], host[1])
    else:
        site = wikipedia.getSite()
    page = wikipedia.Page(site, form.getfirst('page', ''))

    wikipedia.startContent(u'Checklinks: %s' % page.title(), form=False, notice='')
    print('\n')
    # NOTE(review): markup reconstructed (the literal had no placeholder for
    # its argument).  The title comes from the request, so it is escaped.
    print('<input type="text" name="page" value="%s" />'
          % _esc(page.aslink()[2:-2].encode('utf-8')))
    # NOTE(review): the original indentation was lost; the extent of this
    # block is an assumption -- confirm which controls are debug-only.
    if checklink.DEBUG:
        print('\n')
        textbox('threshold', threshold, attrib=' size="1"')
        textbox('threads', checklink.config.max_external_links, attrib=' size="2"')
        textbox('timeout', checklink.config.defaulttimeout, attrib=' size="3"')
        textbox('httpDebug', checklink.config.httpDebug, 'HTTP debug level: ', ' size="3"')
        print('\n')
        checkbox('alwaysUseGet', alwaysUseGet, 'Force downloading')
        checkbox('debug', checklink.DEBUG, 'Debug')
        checkbox('html', htmlmode, 'HTML input')
        checkbox('nothread', nothread, 'Disable threading')
        checkbox('source', checklink.SOURCE, 'Print source')
    print('\n')

    try:
        page.get()
    except wikipedia.NoPage as e:
        printu('NoPage error encountered (%s)' % e)
        return
    except wikipedia.IsRedirectPage:
        link = wikipedia.Page(page.site(), page._redirarg).aslink()[2:-2]
        # NOTE(review): markup and URL shape reconstructed (1 placeholder for
        # 3 arguments as found) -- confirm the intended link target.
        printu('#REDIRECT <a href="http://%s/wiki/%s">%s</a>'
               % (page.hostname(), _esc(link), _esc(link)))
        return

    try:
        printFile('ChecklinksHeader')
        print('')
        # NOTE(review): markup and URL shape reconstructed (no placeholders
        # for 5 arguments as found) -- confirm.
        printu('<h2 id="%s"><a href="http://%s/wiki/%s" title="%s">%s</a></h2>' % (
            parser.escapeId(page.title().encode('utf-8')),
            page.hostname(),
            page.urlname(),
            _esc(page.aslink()[2:-2]),
            _esc(page.title())))
        printFile('tableHeader')
        # Threading stays on unless the "nothread" field was supplied.
        if htmlmode:
            checklink.checkMWhtml(page, addEntry, alwaysUseGet, not nothread)
        else:
            checklink.checkMediaWikiPage(page, addEntry, alwaysUseGet, not nothread)
    finally:
        # NOTE(review): kept exactly as found; this prints a literal "%s".
        # Closing markup and a % argument were presumably lost -- confirm.
        print('\n%s\n')


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        main()
    finally:
        # Always emit the page footer, even when main() raised.
        wikipedia.endContent()