# This is a hacked-together substitute for pagegenerators.
#
# It resolves pywikipedia-style command line arguments into page iterators,
# mostly by querying the MediaWiki API through wikipedia.Site.getUrl().
import re
import urllib
from xml.dom.minidom import parseString

import wikipedia

parameterHelp = '''
-cat:             Work on pages in a specific category
-headlinks:       Works on all links that are in heading tags (== [[link]] ==)
-links:           Works on all links on the given page
-new              Works on the 10 newest pages
-page:            name of the page you want to work on
-prefixindex:     Works on all pages with the given prefix
-ref:             Work on all pages linking to the given page
-subcat:
-titles:
-weblink:         Works on all pages with the given external link
'''

# Module-wide site used for every request; -headlinks: and -boldlinks:
# rebind it to the site of the page they were given.
site = wikipedia.getSite()
# form = cgi.FieldStorage()
# lang = form.getfirst('lang', 'en') [done]
# family = form.getfirst('family', 'wikipedia') [done]
# site = wikipedia.Site(lang, family) or wikipedia.getSite() [not needed]
# page = wikipedia.Page(site, form.getfirst('page', '')) [done]
# text = form.getfirst('text', '').replace('_', ' ')


class GeneratorFactory():
    """Turn command line arguments into iterators of wikipedia.Page objects.

    State kept on the instance:
      debug     -- when true, api() dumps its request data and result nodes
      limit     -- value sent as the API '<prefix>limit' parameter
      namespace -- list of namespace ids (as strings) sent to the API
      pages     -- page titles collected so far
    """

    def __init__(self):
        self.debug = False
        self.limit = 10
        self.namespace = ['0']
        self.pages = []

    def _pagesIterator(self):
        """Return an iterator of Page objects for the titles in self.pages."""
        return iter(wikipedia.Page(site, title) for title in self.pages)

    def api(self, query, nodeName, prefix=''):
        """Run an API query and return an iterator over the resulting pages.

        query    -- dict of extra request parameters; these override the
                    defaults built here (action, format, limit, namespace)
        nodeName -- tag name of the result elements to read from the XML
        prefix   -- parameter prefix for 'limit' and 'namespace'; defaults
                    to nodeName when empty

        The 'title' attribute of every matching element is appended to
        self.pages. The returned iterator yields one wikipedia.Page per
        element, built from its 'title' and 'ns' attributes.
        """
        # FIXME assumed API path
        apiPath = '/w/api.php'
        if not prefix:
            prefix = nodeName
        data = {
            'action': 'query',
            'format': 'xml',
            prefix + 'limit': self.limit,
            prefix + 'namespace': '|'.join(self.namespace),
        }
        data.update(query)
        dom = parseString(site.getUrl(apiPath, data=data))
        # Look the result nodes up once; they are reused three times below.
        nodes = dom.documentElement.getElementsByTagName(nodeName)
        if self.debug:
            print(data)
            for node in nodes:
                wikipedia.output(node.toxml())
        self.pages += [node.getAttribute('title') for node in nodes]
        return iter(
            wikipedia.Page(site, node.getAttribute('title'),
                           defaultNamespace=int(node.getAttribute('ns')))
            for node in nodes)

    def handleArg(self, arg):
        """Process one command line argument.

        Returns:
          - the iterator produced by api() for the API-backed options
            (-contribs:, -templates:, -cat:, -links:, -random, -ref:,
            -backlinks:, -prefixindex:, -weblink:);
          - an empty iterator for -help;
          - True for a bare '-page:';
          - otherwise an iterator over the titles collected in self.pages,
            or None when no titles have been collected.
        """
        # Declared once, before any use: -headlinks: and -boldlinks: rebind
        # the module-level site, and other branches read it.
        global site

        if arg.startswith('-debug:'):
            # Any non-empty value after the colon turns debugging on.
            self.debug = bool(arg[7:])
            # CGI header followed by the blank line that terminates it.
            print('content-type: text/html')
            print('')
            wikipedia.output('pagegenerators.py debug mode enabled')
        elif arg.startswith('-family:') or arg.startswith('-lang:'):
            # Handle inside wikipedia.py
            pass
        elif arg.startswith('-text'):
            # reserved
            pass
        elif arg.startswith('-wpTextbox1:'):
            page = wikipedia.Page(site, 'Special:Textbox1')
            page._contents = arg[12:]
        elif arg.startswith('-title:'):
            # Reserved for pages that could be submitted to the script
            # and used as its input
            pass
        elif arg.startswith('-limit:'):
            self.limit = int(arg[7:])
        elif arg.startswith('-namespaces:'):
            # Replaces the namespace list.
            self.namespace = arg[12:].split('|')
        elif arg.startswith('-namespace:'):
            # pywikipedia compatible? Extends the namespace list.
            self.namespace += arg[11:].split('|')
        elif arg == '-help':
            wikipedia.showHelp()

            # Very hackish way of killing further output
            def dummy(*arg):
                pass
            wikipedia.showHelp = dummy
            return iter([])
        # ------------------------------------------------------
        # Should be separated out so order doesn't matter,
        # i.e. -limit: after the page list
        # ------------------------------------------------------
        elif arg == '-page:':
            return True
        elif (arg.startswith('-page:') or arg.startswith('-pages:')
              or arg.startswith('-titles:')):
            # Pipe-separated titles; empty entries are skipped.
            self.pages.extend(
                title for title in arg[arg.index(':') + 1:].split('|')
                if title)
        # API allows:
        # 'links', 'images', 'templates', 'categories', 'allimages',
        # 'allpages', 'alllinks', 'allcategories', 'backlinks',
        # 'categorymembers', 'embeddedin', 'imageusage', 'search',
        # 'watchlist', 'exturlusage', 'random'
        elif arg.startswith('-contribs:'):
            return self.api({'list': 'usercontribs', 'ucuser': arg[10:]},
                            'item', prefix='uc')
        elif arg.startswith('-templates:'):
            return self.api({'prop': 'templates', 'titles': arg[11:]}, 'tl')
        elif arg.startswith('-cat:'):
            return self.api({'list': 'categorymembers', 'cmtitle': arg[5:]},
                            'cm')
        elif arg.startswith('-links:'):
            return self.api({'prop': 'links', 'titles': arg[7:]}, 'pl')
        elif arg.startswith('-new'):
            # Only adjusts the limit; no page list is requested here.
            try:
                self.limit = int(arg[5:])
            except ValueError:
                self.limit = self.limit or 60
        elif arg.startswith('-random'):
            return self.api({'list': 'random', 'rnlimit': arg[8:] or 1},
                            'page', prefix='rn')
        elif arg.startswith('-ref:') or arg.startswith('-backlinks:'):
            # Skip past the colon so the title is sent without it.
            return self.api(
                {'list': 'backlinks', 'bltitle': arg[arg.index(':') + 1:]},
                'bl')
        elif arg.startswith('-prefixindex:'):
            # API seems to be broken
            return self.api({'list': 'allpages', 'apprefix': arg[13:]},
                            'p', prefix='ap')
        elif arg.startswith('-subcat:'):
            pass
        elif arg.startswith('-weblink:'):
            return self.api({'list': 'exturlusage', 'euquery': arg[9:]}, 'eu')
        # custom for [[WP:FAC]]
        elif arg.startswith('-headlinks:'):
            page = wikipedia.Page(wikipedia.getSite(), arg[11:])
            site = page.site()
            # NOTE(review): this pattern has no capture group, so the whole
            # matched text is stored instead of a title. It looks as if
            # markup was stripped from the pattern -- TODO confirm.
            self.pages = re.findall(r'class="mw-headline"[^<>]*>',
                                    site.getUrl('/wiki/%s' % page.urlname()))
        elif arg.startswith('-boldlinks:'):
            # Does not quite work right yet, need to filter duplicates
            # and namespaces
            page = wikipedia.Page(wikipedia.getSite(), arg[11:])
            site = page.site()
            # FIXME Support namespaces
            # NOTE(review): the pattern appears to have lost its markup. As
            # written it has a single group, so findall yields plain strings
            # and m[1] indexes a character rather than a second group --
            # TODO restore the intended pattern.
            for m in re.findall(r'()?(?(1)|)',
                                site.getUrl('/wiki/%s' % page.urlname())):
                if m[1] not in self.pages:
                    self.pages.append(m[1])
        # Parse a URL for pages
        # WARNING: This option should be used with caution,
        # as any identifiable information is logged by the web server
        elif arg.startswith('-file:http://'):
            # FIXME add more patterns
            # NOTE(review): both groups capture the same text; the pattern
            # may have lost markup -- TODO confirm.
            for m in re.findall(r' title="(([^"]+))"',
                                urllib.urlopen(arg[6:]).read()):
                if m[1] not in self.pages:
                    self.pages.append(m[1])
        else:
            if self.debug:
                wikipedia.output(
                    'DEBUG: pagegenerator unhandled arg : %s' % arg)

        # FIXME should be moved with the -page: statement
        if self.pages:
            return self._pagesIterator()
        return None

    def getCombinedGenerator(self, gen=None):
        """Return an iterator over the collected titles, else gen.

        When self.pages is non-empty it takes precedence and gen is ignored.
        Returns None when there are no collected titles and gen is falsy.
        """
        if self.pages:
            return self._pagesIterator()
        elif gen:
            # FIXME combine outputs
            return gen


def DuplicateFilterPageGenerator(generator):
    """Placeholder: return generator unchanged."""
    return generator


def NamespaceFilterPageGenerator(generator, namespaces, site=None):
    """Placeholder: return generator unchanged; namespaces/site are unused."""
    return generator


def PreloadingGenerator(generator, pageNumber=60):
    """Placeholder: return generator unchanged; pageNumber is unused."""
    return generator


def RedirectFilterPageGenerator(generator):
    """Yield only the pages from generator that are not redirect pages.

    Be careful using this with -page: as it will discard redirects.
    """
    for page in generator:
        if not page.isRedirectPage():
            yield page