#!/usr/bin/env python # -*- coding: utf-8 -*- ''' Reduced function set wikipedia.py for website access Notes: While reading through parts of the pyWikipedia code I have noticed that it underultaizes itself == Converting scripts into tools== Add to top: #!/usr/bin/env python Change the loader to the following: if __name__ == "__main__" and wikipedia.handleUrlAndHeader(): try: wikipedia.startContent() main() finally: wikipedia.endContent() wikipedia.stopme() ''' import os, re, sys, time, urllib # pywikipedia imports import config # HTML debugging import cgitb; cgitb.enable(logdir='tracebacks') ## Python on Solaris handle SIGPIPE different than it does under bash linux #import signal; signal.signal(signal.SIGPIPE,signal.SIG_DFL) startTime = time.time() supportedLanguages = ( # Languages with over 100,000 articles 'ca', 'cs', 'de', 'en', 'es', 'eo', 'fr', 'it', 'hu', 'nl', 'ja', 'no', 'pl', 'pt', 'ru', 'ro', 'fi', 'sv', 'tr', 'uk', 'vo', 'zh', # Languages with over 10,000 articles 'af', 'ar', 'ast', 'ht', 'az', 'bn', 'be', 'be-x-old', 'bpy', 'bs', 'br', 'bg', 'cy', 'da', 'et', 'el', 'eu', 'fa', 'gl', 'ko', 'hi', 'hr', 'io', 'id', 'is', 'he', 'jv', 'ka', 'ku', 'la', 'lv', 'lb', 'lt', 'mk', 'mr', 'ms', 'new', 'nn', 'nap', 'oc', 'pms', 'nds', 'sq', 'scn', 'simple', 'ceb', 'sk', 'sl', 'sr', 'sh', 'su', 'tl', 'ta', 'te', 'th', 'vi', 'wa', # Languages with over 1,000 articles 'als', 'am', 'an', 'roa-rup', 'frp', 'zh-min-nan', 'map-bms', 'bh', 'bar', 'co', 'cv', 'pdc', 'dv', 'ang', 'fo', 'fy', 'fur', 'ga', 'gv', 'gd', 'gan', 'gu', 'zh-classical', 'haw', 'hy', 'hsb', 'ilo', 'ia', 'ie', 'os', 'kn', 'pam', 'csb', 'kw', 'km', 'lad', 'lij', 'li', 'ln', 'lmo', 'ml', 'mt', 'mi', 'mn', 'nah', 'nds-nl', 'ne', 'nrm', 'nov', 'uz', 'pi', 'pag', 'ps', 'kk', 'ksh', 'rm', 'qu', 'sa', 'se', 'sco', 'sw', 'roa-tara', 'tt', 'tg', 'to', 'tk', 'ur', 'vec', 'fiu-vro', 'vls', 'war', 'wuu', 'yi', 'zh-yue', 'yo', 'diq', 'bat-smg', # Hack for weird families (also present in pywikipedia) 'm', 'meta', 'commons', ) def datafilepath(*filename): path = os.path.normpath(os.path.join('../resources/', *filename)) dirs = os.path.dirname(path) if not os.path.exists(dirs): os.makedirs(dirs) return path families = { 'w': 'wikipedia', 'wikt': 'wiktionary', 'n': 'wikinews', 'b': 'wikibooks', 'q': 'wikiquote', 's': 'wikisource', 'v': 'wikiversity', 'wmf': 'wikimediafoundation', # See supportedLanguages above for the working version 'm': 'meta.wikimedia', 'mw': 'mediawiki', 'commons': 'commons', } namespaces = { -2: u'Media', -1: u'Special', 0: None, 1: u'Talk', 2: u'User', 3: u'User talk', 4: u'Wikipedia', 5: u'Wikipedia talk', 6: u'File', 7: u'File talk', 8: u'Mediawiki', # Capitlization fudged FIXME 9: u'Mediawiki talk', 10: u'Template', 11: u'Template talk', 12: u'Help', 13: u'Help talk', 14: u'Category', 15: u'Category talk', # Custom NS 100: u'Portal', 101: u'Portal talk', 108: u'Book', 109: u'Book talk', } class Family(object): def __init__(self, name): if name in families.keys(): self.name = families[name] else: self.name = name self.shortname = name # FIXME self.namespaces = namespaces def __repr__(self): return '%s.Family(%r)'%(__name__, self.name) def dbName(self, code): # returns the name of the MySQL database if self.name != 'wikipedia': return '%s%s' % (code, self.name) else: return '%swiki' % (code) class Site(object): def __init__(self, code, fam=None): self.lang = code.lower().encode('ascii') if fam: if type(fam) == type("string"): self.family = Family(fam) else: self.family = fam else: self.family = MySite.family self.messages = False def __repr__(self): return '%s.Site(%r, %r)'%(__name__, self.lang, self.family) def getUrl(self, path, retry = True, sysop = False, data = None, compress = True): url = '%s://%s%s' % (self.protocol(), self.hostname(), path) print('' % (url.encode('utf-8'),)) uo = MyURLopener() f = uo.open(url, urllib.urlencode(data or '') or None) charset = 'utf-8' # TODO: include magic number check '\x1f\x8b' at bytes 0-1 if f.info().get('Content-Encoding') in ('gzip', 'x-gzip'): import StringIO, gzip return gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() # raise ServerError("G-zip decoding not implemented. Purge the cache of %s to make it work again"%url) else: return f.read() def protocol(self): return 'http' def hostname(self): return '.'.join((self.lang, self.family.name, 'org')) def path(self): return '/w/index.php' def apipath(self): return '/w/api.php' def encoding(self): return 'utf-8' def language(self): return self.lang def sitename(self): return ':'.join([self.family.name, self.lang]) def isInterwikiLink(self, s): return any([s.startswith('%s:'%iw) or s.startswith(':%s:'%iw) for iw in supportedLanguages]) def namespace(self, num): return namespaces[num] or '' def category_namespaces(self): return (namespaces[14], ) def page_action_address(self, s, action): return '%s?title=%s&action=%s' % (self.path(), s, action) def put_address(self, s): return '%s?title=%s&action=submit' % (self.path(), s) def get_address(self, s): return '%s?title=%s&redirect=no' % (self.path(), s) def nice_get_address(self, s): return '/wiki/%s' % (s) def dbName(self): """Return MySQL database name.""" return self.family.dbName(self.lang.replace('-', '_')) class Page(object): def __init__(self, site, title, insite=None, defaultNamespace=None): self._site = site self._namespace = None try: title = unicode(title) except: try:title = unicode(title, 'utf-8') except:title = unicode(title, 'latin-1') # FIXME: remove need to test for ASCII if title.encode('ascii', 'ignore') == title: try: title = unicode(urllib.unquote(title.encode('ascii')), 'utf-8') except: raise NoPage('Invalid esacped character in [[%s]]!' % title) title = html2unicode(title) # decode html references title = title.replace('_', ' ').strip() for part in title.split(':')[:-1]: prefix = part.lower().strip() # FIXME support ":title" in defaultNamespace # FIXME support other languages # FIXME better reverse lookup if prefix == '': self._namespace = 0 title = title[len(prefix)+1:] break elif prefix.capitalize() in namespaces.values(): #self._namespace = dict(zip(namespaces.values(), namespaces.keys())).index(prefix.capitalize()) # Now without the for loops? self._namespace = namespaces.keys()[namespaces.values().index(prefix.capitalize())] break elif prefix in supportedLanguages: # Changing Languages if prefix!=self._site.lang: self._site = Site(prefix, self._site.family) title = title[len(prefix)+1:] elif prefix in families.keys(): # Changing Families if prefix!=self._site.family.shortname: self._site = Site(self._site.lang, Family(prefix)) title = title[len(prefix)+1:] else: break # Clean up title # FIXME better code if self._namespace == None: self._namespace = defaultNamespace or 0 if defaultNamespace: title = namespaces[defaultNamespace]+':'+title self._title = title.strip()# is this needed now??? -> .replace('_', ' ').strip('\t :') def __repr__(self): return '%s.Page(%r, %r)'%(__name__, self._site, self._title) def site(self): return self._site def latestRevision(self): return 0 def canBeEdited(self): return True def isTalkPage(self): return self._namespace % 2 == 1 def protocol(self): return self._site.protocol() def namespace(self): return self._namespace def hostname(self): return self._site.hostname() def path(self): return '/w/index.php' def urlname(self): return urllib.quote(self.title(underscore=True).encode('utf-8')) def exists(self): try: return self.get() and True except:return False def getReferences(self, follow_redirects=True, withTemplateInclusion=True, onlyTemplateInclusion=False, redirectsOnly=False): pass import pagegenerators gf = pagegenerators.GeneratorFactory() gf.limit=500 return gf.api({'list':'backlinks','bltitle':self.title(),}, 'bl') def linkedPages(self): try: return [Page(self.site(), m.group(1)) for m in re.finditer(r'\[\[([^][|<>#\n]+)[#|][^][\n]*\]\]', self.get())] except NoPage: raise except IsRedirectPage: raise except SectionError: return [] def isRedirectPage(self): try: return self.get() and False except IsRedirectPage: return True except: return False def title(self, underscore = False): if underscore: return self._title.replace(' ', '_') else: return self._title def titleWithoutNamespace(self, underscore=False): if self.namespace() == 0: return self.title(underscore=underscore) else: return self.title(underscore=underscore).split(':', 1)[1] def aslink(self, forceInterwiki=False): if forceInterwiki or self.site() != getSite(): if self.site().family != getSite().family: return u'[[%s:%s:%s]]' % (self.site().family.shortname, self.site().lang, self.title()) else: return u'[[%s:%s]]' % (self.site().lang, self.title()) else: return '[[%s]]' % self.title() def get(self, force=False, get_redirect=False, throttle=True, sysop=False, nofollow_redirects=False, change_edit_time=True): # FIXME add redirect following # all NoPage exception are contained here for convience for illegalChar in u'#<>[]|{}\n\ufffd': if illegalChar in self.title(): raise NoPage('Illegal character in %s!' % self.aslink()) if not self._title: raise NoPage("No title") if not hasattr(self, '_contents'):# or not self._contents or force: text = self._site.getUrl("%s?title=%s&action=%s"%(self.path(), self.urlname(), 'edit')) #uo = MyURLopener() #print('' % (self.protocol(), self.hostname(), self.path(), self.urlname(), 'edit')) #f = uo.open('%s://%s%s?title=%s&action=%s' % (self.protocol(), self.hostname(), self.path(), self.urlname(), 'edit')) charset = 'utf-8' #text = f.read() self._html = text if not '' in text: raise NoPage("No textarea found") elif not '