#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Syntax: python checklinks.py

NOTE: main() itself only parses -mintime:, -list: and -limit:.  Every other
argument is handed to pagegenerators; anything it does not recognise is
reported as an unknown argument.

Options are:
-namespace:    Specify the namespace to scan (default: allpages)
-threaded:     Turn on/off threading (default:yes)
-output:       File to direct output to
-limit:        The number of pages to process per report (default: 100);
               pages coming from a page generator are capped at 20
-mintime:      Minimum age of a page's cache file, in seconds, before the
               page is checked again (default: 86400, i.e. 24 hours)
-list:         Name of a job file in the cgi-bin jobs directory; the first
               tab-separated field of each line names a report to process
-shift:        Switch page to talk or article spaces [-1|0 (default)|1]
-FAs_month     Checks this month's featured status page (removed)
-r:            Retrieve links using a regular expression on the HTML output
               of a page
-daemon        Continually checking for jobs in queue file
-runonce       Quits once the job queue is empty (only in daemon mode)
-autoselect    Examines html for codes on how to generate the list to parse
-convert
-void          Stops parsing options
-httpmethod    The method by which the bot retrieves links.
               Options are HEAD (default) and GET

These parameters are supported to specify which pages titles to print
in addition to those in pagegenerators:

-page:         Only check a specific page.
               Argument can also be given as "-page:pagetitle".

See also: pagegenerators.py

(c) 2007-2009 Dispenser
"""
import re
import time
import codecs
import sys
import os

import wikipedia
import pagegenerators
import checklink

# Locations of the files this script reads.  The values are the same paths
# that were previously spelled out inline at each use.
CGI_BIN_DIR = '/home/dispenser/public_html/cgi-bin/'
CACHE_DIR = CGI_BIN_DIR + 'cache/'
REPORTS_DIR = CGI_BIN_DIR + 'reports/'
JOBS_DIR = CGI_BIN_DIR + 'jobs/'


def tsvLog(*items):
    """
    Return *items* joined by tabs as a single .tsv-safe line.

    Each item is converted with unicode(); newlines and carriage returns are
    replaced by their two-character backslash escapes and embedded tabs are
    expanded to spaces so that the only tabs left are the field separators.
    """
    return "\t".join([
        unicode(x).replace('\n', '\\n').replace('\r', '\\r').expandtabs()
        for x in items
    ])


def cacheAge(page):
    """
    Return the age, in seconds, of the cache file belonging to *page*.

    The cache file name is "<sitename>:<title>" with underscores for spaces
    and "/" replaced by "|", located in CACHE_DIR.  When the file cannot be
    stat'ed (OSError, e.g. it does not exist) the age is float('inf'), so the
    page always looks stale.
    """
    try:
        cache_path = (CACHE_DIR + page.site().sitename() + ':' +
                      page.title(underscore=True).replace('/', '|').encode('utf-8'))
        return time.time() - os.path.getmtime(cache_path)
    except OSError:
        return float('inf')


def listgen(filename, limit = 5000):
    """
    Yield the first tab-separated field of every line in *filename*.

    Raises ValueError for a line that contains no tab.

    NOTE(review): *limit* is accepted for interface compatibility but is not
    applied; confirm whether the list was meant to be truncated.
    """
    # 'with' guarantees the handle is closed even if the consumer stops early.
    with open(filename, 'r') as jobfile:
        for line in jobfile:
            yield line[:line.index('\t')]


def reportgen(reportname):
    """
    Yield a wikipedia.Page for every line of the report *reportname*.

    The report is read from REPORTS_DIR ("/" in the name is replaced by "|").
    Each line has the form "%s\\t%s\\n"; everything after the first tab is
    decoded as UTF-8 and used as the page title.  Raises ValueError for a
    line that contains no tab.
    """
    with open(REPORTS_DIR + reportname.replace('/', '|'), 'r') as report:
        for reportline in report:
            # Strip only the line terminator: blindly dropping the last
            # character would truncate the title on a final line that has no
            # trailing newline.
            title = reportline[reportline.index('\t') + 1:].rstrip('\r\n')
            yield wikipedia.Page(wikipedia.getSite(), unicode(title, 'utf-8'))


def main():
    """
    Parse the command line and re-check the links of the selected pages.

    Pages come from the reports named in a -list: job file and/or from a
    pagegenerators generator.  Without either, the module help is shown.
    """
    # The summary line printed by ChecklinksOnGenerator reads this global.
    global starttime

    # Result is unused here; the call is kept in case it has initialisation
    # side effects.
    site = wikipedia.getSite()
    reports = None
    gen = None
    # Prevent overlapping queues
    limit = 100
    # minimum time before updating
    mintime = 24 * 60 * 60

    # Parsing of -threaded: and -httpmethod: is disabled; fixed values are
    # passed to checklink in ChecklinksOnGenerator below.
    for arg in wikipedia.handleArgs():
        if arg.startswith('-mintime:'):
            mintime = int(arg[9:])
        elif arg.startswith('-list:'):
            job_name = arg[6:].lower().replace(' ', '_')
            reports = listgen(JOBS_DIR + job_name, limit)
        elif arg.startswith('-limit:'):
            limit = int(arg[7:])
        else:
            generator = pagegenerators.GeneratorFactory().handleArg(arg)
            if generator:
                # The first generator given wins; later ones are ignored.
                gen = gen or generator
            else:
                wikipedia.output('Unknown argument: %s' % arg)

    def ChecklinksOnGenerator(gen, limit = 500):
        """
        Run checklink on the pages of *gen* until *limit* pages were checked.

        Pages whose cache file is younger than `mintime` seconds are skipped.
        A KeyboardInterrupt ends the run quietly; any other exception is
        logged and re-raised.  A summary line is printed at the end.
        """
        def noEntry(*arg):
            # Callback handed to checklink; deliberately ignores its input.
            pass

        alwaysUseGet = False
        threaded = True
        counter = 0
        for page in gen:
            # Stat the cache file once instead of once per use.
            age = cacheAge(page)
            if age < mintime:
                wikipedia.output('Skipping page %s (cache %d d %d h old)' % (page.aslink(), age/60.0/60.0/24.0, age/60.0/60.0%24.0))
                continue
            try:
                if page.isRedirectPage(): # uses page.get()
                    wikipedia.output('%s is a redirect' % page.aslink())
                else:
                    wikipedia.output('Getting page %s' % page.aslink())
                    # Reconfigure for longer times
                    checklink.DEBUG = True
                    checklink.printu = wikipedia.output
                    checklink.config.defaulttimeout = 300
                    checklink.config.max_external_links = 20
                    checklink.checkMediaWikiPage(page, noEntry, alwaysUseGet, threaded)
                    # NOTE(review): only pages actually checked count towards
                    # the limit (redirects do not) -- the original nesting of
                    # these lines was lost; confirm.
                    counter += 1
                    if counter >= limit:
                        wikipedia.output('Stopping at %d pages' % counter)
                        break
            except KeyboardInterrupt:
                break
            except Exception as e:
                wikipedia.output('Python exception: %r'%e)
                raise
        wikipedia.output('Re-cached %d pages in %#4.2f minutes (%d s culmative CPU)' % (counter, (time.time()-starttime)/60.0, time.clock()))
        wikipedia.output('')

    if not gen and not reports:
        wikipedia.showHelp('checklinks')
        return

    # Start the clock before any work.  It used to stay at 0 unless -list:
    # was given, which made the elapsed time count from the Unix epoch.
    starttime = time.time()
    if reports:
        for reportname in reports:
            wikipedia.output('Opening report: %r'%reportname)
            ChecklinksOnGenerator(reportgen(reportname), limit=limit)
    if gen:
        # Debug trace kept from the original; this spelling prints the same
        # text on Python 2 and Python 3.
        print('if gen:')
        # NOTE(review): generator pages use a fixed cap of 20, not -limit:.
        ChecklinksOnGenerator(gen, limit = 20)


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()