#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Re-run checklink on wiki pages whose cached results are out of date.

&params;

Syntax: python checklinks.py [options]

Options are:
-mintime:     Minimum cache age in seconds before a page is re-checked
              (default: 86400, i.e. one day)
-maxtime:     Maximum cache age in seconds; pages with an older cache
              (or, when finite, no cache at all) are skipped
              (default: infinity)
-limit:       The number of pages to process
              (default: 2 * (mintime in days + 10))
-list:        Name of a tab-separated file in the cgi-bin/list directory;
              column 1 is a report name, column 3 its total page count
-report:      Name of a single report in the cgi-bin/reports directory
-totalpages:  Total number of pages for -report; when positive,
              re-caching of already cached pages is limited to the
              average per day

e.g.
    ./checklinks.py -mintime:$((60*60*24*90)) -subcat:Human_spaceflights

See also: pagegenerators.py

Dispenser (c) 2009
"""
import re
import time
import codecs
import sys
import os

import wikipedia
import pagegenerators
import checklink

# Root of the tool's CGI tree; the cache/, list/ and reports/ directories
# used below all live underneath it.
CGI_BIN_DIR = '/home/dispenser/public_html/cgi-bin/'


def cacheAge(page):
    """Return how many seconds ago the cache file for `page` was modified.

    The cache file name is "<sitename>:<title>", where the title uses
    underscores and has every '/' replaced by '|'.

    Returns float('inf') when the file's modification time cannot be read
    (os.path.getmtime raises OSError), which callers treat as
    "never cached".
    """
    cachefile = (CGI_BIN_DIR + 'cache/'
                 + page.site().sitename().encode('ascii')
                 + ':'
                 + page.title(underscore=True).replace('/', '|').encode('utf-8'))
    try:
        # time when cache was updated (in seconds)
        return time.time() - os.path.getmtime(cachefile)
    except OSError:
        return float('inf')


def listgen(filename, limit = 5000):
    """Yield [first column, third column] for each line of a list file.

    `filename` is a tab-separated text file.  main() unpacks each yielded
    pair as (report name, total pages); the third column is yielded as raw
    text (including any line ending) and converted with int() by the caller.

    Blank lines are skipped.  `limit` is accepted for backward
    compatibility but is not used.
    """
    with open(filename, 'r') as listfile:
        for line in listfile:
            if not line.strip():
                # A blank line has no columns to report.
                continue
            yield line.split('\t')[0:3:2]


def reportgen(reportname):
    """Yield a wikipedia.Page for every report line that names the current site.

    The report is read from the cgi-bin/reports directory ('/' in
    `reportname` is replaced by '|').  Each line has the form
        sitename() \\t page.title() \\n
    Lines for other sites, and lines without a tab, are skipped.
    """
    site = wikipedia.getSite()
    sitename = site.sitename()
    reportpath = CGI_BIN_DIR + 'reports/' + reportname.replace('/', '|')
    with open(reportpath, 'r') as report:
        for reportline in report:
            # Strip only the line ending so that a final line without a
            # trailing newline does not lose the last character of its title.
            linesite, tab, title = reportline.rstrip('\r\n').partition('\t')
            if not tab or linesite != sitename:
                continue
            # Python 2: the report file holds UTF-8 encoded byte strings.
            yield wikipedia.Page(site, unicode(title, 'utf-8'))


def main():
    """Parse the command line and re-check links on the selected pages.

    Pages come either from reports (-list / -report) or from a
    pagegenerators generator; when neither is given the help text is shown.
    """
    # NOTE(review): `site` is never read below; the call is kept in case
    # getSite() initialisation is relied on -- confirm and remove.
    site = wikipedia.getSite()
    reports = None
    single_report = None
    gen = None
    # Limits
    limit = None
    totalpages = 0
    starttime = time.time()
    # minimum time before updating in seconds
    mintime = 24 * 60 * 60
    maxtime = float('inf')

    for arg in wikipedia.handleArgs():
        if arg.startswith('-list:'):
            reports = listgen(CGI_BIN_DIR + 'list/' + arg[6:].replace(' ', '_'))
            single_report = None
        # old revisions code
        elif arg.startswith('-limit:'):
            limit = int(arg[7:])
        elif arg.startswith('-mintime:'):
            mintime = float(arg[9:])
        elif arg.startswith('-maxtime:'):
            maxtime = float(arg[9:])
        elif arg.startswith('-report:'):
            # Only remember the name here; the (name, totalpages) pair is
            # built after parsing so that -totalpages: is honoured no matter
            # which side of -report: it appears on.
            single_report = arg[8:]
            reports = None
        elif arg.startswith('-totalpages:'):
            totalpages = int(arg[12:])
        else:
            generator = pagegenerators.GeneratorFactory().handleArg(arg)
            if generator:
                gen = gen or generator
            else:
                wikipedia.output('Unknown argument: %s' % arg)

    if single_report is not None:
        reports = [(single_report, totalpages)]

    # Setup limit
    if not limit:
        limit = int(2.0 * (mintime/3600.0/24.0 + 10.0))
        wikipedia.output('Page limit unspecified; setting to %s'%limit)

    ## Limits Explained #######################
    # period   rate   max cat size
    #    1      22        22
    #    7      34       238
    #   14      48       672
    #   30      80      2400
    #   90     200     18000
    # Also time limit of 4 hours (cumulative)
    # If totalpages is specified re-caching
    # is limited to average per day

    def ChecklinksOnGenerator(gen, limit = 500, totalpages = 0):
        """Run checklink on pages from `gen` whose cache age is in range.

        A page is processed only when mintime < cache age <= maxtime.
        When `totalpages` is positive, already cached pages beyond
        totalpages / (mintime in days) are reported as overflow and skipped.
        Stops after `limit` pages, after four hours of total run time
        (measured from the start of main()), or on KeyboardInterrupt.
        """
        reportstart = time.time()
        counter = 0
        recaching = 0
        # Refresh period in days.  A period of zero means "no quota";
        # dividing by it would raise ZeroDivisionError.
        period_days = mintime/60.0/60.0/24.0

        for page in gen:
            age = cacheAge(page)
            if age < float('inf'):
                agefmt = 'cache%3dd %2dh old' % (age/60.0/60.0/24.0, age/60.0/60.0%24.0,)
            else:
                agefmt = 'never cached'

            if not (mintime < age <= maxtime):
                wikipedia.output('Skipping page %-46s (%s)' % (page.aslink(), agefmt,))
                continue
            elif totalpages > 0 and age < float('inf'):
                if period_days and recaching > totalpages / period_days:
                    wikipedia.output('Overflow page %-46s (%s)' % (page.aslink(), agefmt,))
                    continue
                else:
                    recaching += 1

            # NOTE(review): the nesting below was reconstructed from a
            # whitespace-collapsed source; `counter` is assumed to count
            # only pages that were actually checked -- confirm.
            try:
                if page.isRedirectPage():  # uses page.get()
                    wikipedia.output('%s is a redirect' % page.aslink())
                elif not page.exists():
                    wikipedia.output('%s does not exist' % page.aslink())
                else:
                    wikipedia.output('Getting page %-47s (%s)' % (page.aslink(), agefmt,))
                    # Reconfigure for longer times
                    checklink.DEBUG = True
                    checklink.SOURCE = False
                    checklink.printu = wikipedia.output
                    checklink.config.defaulttimeout = 300
                    checklink.config.max_external_links = 8
                    checklink.config.useGET = False
                    checklink.config.threaded = True
                    checklink.checkMediaWikiPage(page)
                    counter += 1

                if counter >= limit:
                    wikipedia.output('Stopping at %d pages' % counter)
                    break
                # We have about 6 reports to do, 24/6 = 4 hours
                if time.time() - starttime > 60 * 60 * 4:
                    wikipedia.output('Stopping, run time exceeded 4 hours. ')
                    break
            except KeyboardInterrupt:
                break
            except Exception as e:
                # Log the failure, then let it propagate unchanged.
                wikipedia.output('Python exception: %r'%e)
                raise

        # time.clock() is the Python 2 process-time call this script uses.
        wikipedia.output('Re-cached %d pages in %#4.2f minutes (%d s culmative CPU)' % (counter, (time.time()-reportstart)/60.0, time.clock()))
        wikipedia.output('')

    if reports:
        for (name, pagecount) in reports:
            wikipedia.output('Opening report: %r'%name)
            ChecklinksOnGenerator(
                reportgen(name),
                limit=limit,
                totalpages=int(pagecount))
    elif gen:
        wikipedia.output(' '.join(list(wikipedia.handleArgs())))
        wikipedia.output('Using generator %r' % gen )
        ChecklinksOnGenerator(gen, limit = limit)
    else:
        wikipedia.showHelp('checklinks')


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()