# Hacked-together helper functions for pagegenerators.
#
# Provides GeneratorFactory, which turns command-line style arguments
# (e.g. "-cat:Foo", "-links:Bar") into iterators of wikipedia.Page objects,
# plus a few small generator filters.
import re
import urllib
from xml.dom.minidom import parseString

import wikipedia

parameterHelp = '''
-cat: Work on pages in a specific category
-headlinks: Works on all links that are in heading tags (== [[link]] ==)
-links: Works on all links on the given page
-new Works on the 10 newest pages
-page: name of the page you want to work on
-prefixindex: Works on all pages with the given prefix
-ref: Work on all pages linking to the given page
-subcat:
-titles:
-weblink: Works on all pages with the given external link
'''

# Default site, resolved once at import time; used by GeneratorFactory.api()
# and GeneratorFactory.getCombinedGenerator().
site = wikipedia.getSite()

# Porting notes from the CGI front end:
# form = cgi.FieldStorage()
# lang = form.getfirst('lang', 'en')  [done]
# family = form.getfirst('family', 'wikipedia')  [done]
# site = wikipedia.Site(lang, family) or wikipedia.getSite()  [not needed]
# page = wikipedia.Page(site, form.getfirst('page', ''))  [done]
# text = form.getfirst('text', '').replace('_', ' ')


class GeneratorFactory():
    """Build page generators from command-line style arguments.

    Feed each argument to handleArg(); generators created along the way are
    accumulated in self.gens and can be retrieved (combined) through
    getCombinedGenerator().
    """

    def __init__(self):
        self.debug = False
        # Default number of results requested per API query.
        self.limit = 10
        # Namespace ids (as strings) sent with every API query.
        self.namespace = ['0']
        # Page titles collected by api() and by the -file:http:// handler.
        self.pages = []
        # Generators created by handleArg(), in the order they were requested.
        self.gens = []
        # NOTE(review): nothing in this module ever fills this list; the
        # -namespace(s): arguments write to self.namespace instead.
        self.namespaces = []

    def api(self, query, nodeName, prefix=''):
        """Run an API query and return an iterator over the resulting pages.

        query    -- dict of extra request parameters; overrides the defaults.
        nodeName -- tag name of the XML elements that describe result pages.
        prefix   -- prefix of the 'limit' and 'namespace' parameters;
                    defaults to nodeName when empty.

        The title of every result node is also appended to self.pages.
        """
        # FIXME assumed API path
        apiPath = '/w/api.php'
        if not prefix:
            prefix = nodeName
        data = {
            'action': 'query',
            'format': 'xml',
            prefix + 'limit': self.limit,
            prefix + 'namespace': '|'.join(self.namespace),
        }
        data.update(query)
        # # UTF-8 encode
        # for key in data:
        #     if type(data[key]) == type(u'unicode'):
        #         data[key] = data[key].encode('utf-8')
        dom = parseString(site.getUrl(apiPath, data=data))
        # Walk the document once and reuse the node list below.
        nodes = dom.documentElement.getElementsByTagName(nodeName)
        if self.debug:
            print(data)
            for node in nodes:
                wikipedia.output(node.toxml())
            #print '"', site.getUrl(apiPath, data=data), '"'
        self.pages += [node.getAttribute('title') for node in nodes]
        return iter((wikipedia.Page(site, node.getAttribute('title'),
                                    defaultNamespace=int(node.getAttribute('ns')))
                     for node in nodes))

    def handleArg(self, arg):
        """Interpret one command-line argument.

        Returns the combined generator (see getCombinedGenerator) when the
        argument produced a page generator, an empty iterator for '-help',
        and False for everything else (settings, reserved or unknown
        arguments).
        """
        gen = None
        site = wikipedia.getSite()
        if arg.startswith('-debug:'):
            # bool() of any non-empty string is True, so "-debug:false" used
            # to switch debugging on; interpret the usual "off" spellings.
            self.debug = arg[7:].strip().lower() not in (
                '', '0', 'false', 'no', 'off')
            if self.debug:
                print('')
                wikipedia.output('pagegenerators.py debug mode enabled')
        elif arg.startswith('-family:') or arg.startswith('-lang:'):
            # Handle inside wikipedia.py
            pass
        elif arg.startswith('-text'):
            # reserved
            pass
        elif arg.startswith('-wpTextbox1:'):
            # NOTE(review): the page built here is never stored or returned.
            page = wikipedia.Page(site, 'Special:Textbox1')
            page._contents = arg[12:]
        elif arg.startswith('-title:'):
            # Reserved for pages could be submitting to the script
            # and it will use that as input
            pass
        elif arg.startswith('-limit:'):
            self.limit = int(arg[7:])
        elif arg.startswith('-namespaces:'):
            self.namespace = arg[12:].split('|')
        elif arg.startswith('-namespace:'):
            # pywikipedia compatible?
            self.namespace += arg[11:].split('|')
        elif arg == '-help':
            wikipedia.showHelp()

            # Very hackish way of killing further output
            def dummy(*args):
                pass
            wikipedia.showHelp = dummy
            return iter([])
        # ------------------------------------------------------
        # Should be separated out so order doesn't matter, i.e.
        # -limit: after the page list
        # ------------------------------------------------------
        #elif arg == '-page:':
        #    return True
        elif (arg.startswith('-page:') or arg.startswith('-pages:')
              or arg.startswith('-titles:')):
            titles = arg[arg.index(':') + 1:].split('|')
            gen = iter((wikipedia.Page(site, title)
                        for title in titles if title))
        # API allows:
        # 'links', 'images', 'templates', 'categories', 'allimages',
        # 'allpages', 'alllinks', 'allcategories', 'backlinks',
        # 'categorymembers', 'embeddedin', 'imageusage', 'search',
        # 'watchlist', 'exturlusage', 'random'
        elif arg.startswith('-contribs:'):
            gen = self.api({'list': 'usercontribs', 'ucuser': arg[10:]},
                           'item', prefix='uc')
        elif arg.startswith('-templates:'):
            gen = self.api({'prop': 'templates', 'titles': arg[11:]}, 'tl')
        elif arg.startswith('-cat:'):
            # Accept both "-cat:Foo" and "-cat:Category:Foo".
            if arg.lower().startswith('-cat:category:'):
                category = arg[5:]
            else:
                category = 'Category:' + arg[5:]
            gen = self.api({'list': 'categorymembers', 'cmtitle': category},
                           'cm')
        elif arg.startswith('-links:'):
            gen = self.api({'prop': 'links', 'titles': arg[7:]}, 'pl')
        elif arg.startswith('-new'):
            # Only adjusts the limit; no generator is created here.
            try:
                self.limit = int(arg[5:])
            except ValueError:
                self.limit = self.limit or 60
        elif arg.startswith('-random'):
            gen = self.api({'list': 'random', 'rnlimit': arg[8:] or 1},
                           'page', prefix='rn')
        elif arg.startswith('-ref:') or arg.startswith('-backlinks:'):
            # Skip the colon itself; the previous slice started *at* the
            # colon and sent ":Title" as the backlink target.
            gen = self.api({'list': 'backlinks',
                            'bltitle': arg[arg.index(':') + 1:]}, 'bl')
        elif arg.startswith('-prefixindex:'):
            # API seems to be broken
            gen = self.api({'list': 'allpages', 'apprefix': arg[13:]},
                           'p', prefix='ap')
        elif arg.startswith('-subcat:'):
            pass
        elif arg.startswith('-weblink:'):
            gen = self.api({'list': 'exturlusage', 'euquery': arg[9:]}, 'eu')
        # custom for [[WP:FAC]]
        elif arg.startswith('-headlinks:'):
            p = wikipedia.Page(site, arg[11:])
            # NOTE(review): this pattern defines no 'page' group, so
            # m.group('page') raises IndexError as soon as the generator is
            # consumed and a match is found. The pattern looks like it lost
            # its link-matching part; restore it before relying on this.
            gen = iter((wikipedia.Page(p.site(), m.group('page'))
                        for m in re.finditer(
                            r'class="mw-headline"[^<>]*>',
                            site.getUrl('/wiki/%s' % p.urlname()))))
        elif arg.startswith('-boldlinks:'):
            # Does not quite work right yet, need to filter duplicated and
            # namespaces
            p = wikipedia.Page(site, arg[11:])
            # FIXME Support namespaces
            # NOTE(review): the pattern below only matches the empty string
            # and defines no 'page' group, so m.group('page') raises
            # IndexError once the generator is consumed. Also,
            # self.namespaces is never populated, so the filter always falls
            # back to namespace 0.
            gen = DuplicateFilterPageGenerator(NamespaceFilterPageGenerator(
                iter((wikipedia.Page(p.site(), m.group('page'))
                      for m in re.finditer(
                          r'()?(?(1)|)',
                          site.getUrl('/wiki/%s' % p.urlname())))),
                [int(s) for s in self.namespaces] or [0]
            ))
        # Parse a URL for pages
        # WARNING: This module should be used with caution
        # as any identifiable is logged by the web server
        elif arg.startswith('-file:http://'):
            # FIXME add more patterns
            response = urllib.urlopen(arg[6:])
            try:
                html = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
            # NOTE(review): the doubled capture group looks like a leftover;
            # m[1] is the inner group and equals m[0].
            for m in re.findall(r' title="(([^"]+))"', html):
                if m[1] not in self.pages:
                    self.pages.append(m[1])
        else:
            if self.debug:
                wikipedia.output(
                    'DEBUG: pagegenerator unhandled arg : %s' % arg)
        if self.debug:
            wikipedia.output('DEBUG: %r; self.pages = %r' % (arg, self.pages,))
        #if self.pages != []:
        #    #FIXME should be move with the -page: statement
        #    #return iter((wikipedia.Page(site, page) for page in self.pages))
        if gen:
            self.gens.append(gen)
            return self.getCombinedGenerator()
        else:
            return False

    def getCombinedGenerator(self, gen=None):
        """Return one iterator covering everything collected so far.

        gen -- optional generator to put in front of the collected ones.

        Returns the single collected generator, a chain of all collected
        generators, an iterator over the titles in self.pages when no
        generator was collected, or None when nothing was collected at all.
        """
        if gen:
            self.gens.insert(0, gen)
        if self.gens:
            if len(self.gens) == 1:
                return self.gens[0]
            else:
                return CombinedPageGenerator(self.gens)
        elif self.pages:
            return iter((wikipedia.Page(site, page) for page in self.pages))
        else:
            return None


def CombinedPageGenerator(generators):
    """Yield the pages of each generator in turn, in the given order."""
    for generator in generators:
        for page in generator:
            yield page


def DuplicateFilterPageGenerator(generator):
    """Yield each page once, keyed on family name, language and title."""
    seenPages = set()
    for page in generator:
        _page = u"%s:%s:%s" % (page._site.family.name, page._site.lang,
                               page._title)
        if _page not in seenPages:
            seenPages.add(_page)
            yield page


def NamespaceFilterPageGenerator(generator, namespaces, site=None):
    """Yield only the pages whose namespace() is in `namespaces`.

    The `site` parameter is accepted but not used.
    """
    # TODO convert namespace string into numbers
    for page in generator:
        if page.namespace() in namespaces:
            yield page


def PreloadingGenerator(generator, pageNumber=60):
    """Return `generator` unchanged; `pageNumber` is ignored."""
    return generator


def RedirectFilterPageGenerator(generator):
    """Return `generator` unchanged; redirect filtering is disabled."""
    return generator
    ## Be careful using this with -page: as it will discard redirects
    #for page in generator:
    #    if not page.isRedirectPage():
    #        yield page