#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Show the images of a wiki page side by side with their alt text.

The script downloads the rendered HTML of each requested page, scans it with
``ImageR`` and prints one block of output per image it keeps.

NOTE(review): this copy of the script looks like it went through an
HTML-to-text conversion.  Markup that used to live inside several string
literals and inside ``ImageR`` (tags, named-group names) is missing, and the
position of some ``%s`` placeholders no longer matches the argument tuples.
Those literals are reproduced below exactly as they appear and are marked
"markup lost"; restore them from the original source before running.
"""
import re
import time

import wikipedia
import pagegenerators

# Text for translation
#
# Magic words: %(title)s %(url)s %(parseddate)s
# NOTE(review): the 'simple' text reads "is suppose to"; presumably it should
# be "is supposed to".  Left untouched here because it is a runtime string.
lead = {
    'en': u"""

Alternative text (alt text) is meant for readers who cannot see an image, such as blind readers and readers who use a text or mobile browser. It should summarize an image's appearance, and should not duplicate its caption. Every image should have alt text, except for purely decorative images, which should instead have "|link=".

The following table shows images and captions on the left, and alt text and captions on the right: the right column is what a visually impaired reader will hear. This table was computed from the copy of %(title)s cached on %(parseddate)s.

""",
    'simple': u"""

Alternative text (alt text) is suppose to tell the important features to a person who cannot see the image due to non-graphical or mobile browser or blindness.

The following table shows pictures on the left and the alternative text for the picture on the right which a blind person might hear. This table was computed from the copy of %(title)s cached on %(parseddate)s.

""",
}

# Column headings, keyed by site language.
ImageDesc = {
    'en': "Image and thumbnail",
    'de': "Bild und miniaturbild",
    'simple': "Pictures",
}
AltDesc = {
    'en': "Text description",
    'de': "alttext",
    'simple': "Text for the blind",
}

# Regular expression for getting images
# NOTE(review): markup lost.  main() reads the groups 'poster', 'videoalt',
# 'videomagnify', 'image', 'thumbalt', 'thumbmagnify', 'imagealt',
# 'imagewidth' and 'decalt', but every "(?P<name>" below has lost its name and
# the literal tags are gone, so this pattern does not compile as written.
# The line breaks are a best-effort layout of the verbose-mode source.
ImageR = re.compile(r'''(?six)
# Video posters
\s* ]*>\s* ]*>\s*
(?P]*>]*alt="(?P[^"]*)"\ />)
\s* \s* \s* (?P]*>(?P\s*.*?|).*?)\s* \s* \s*
|
# Regex for border, frame, thumbnail/thumb, ogg video
\s* ]*>(?P]*>]*class="thumbimage"[^<>]*/>)\s* (?P]*>(?P\s*.*?|).*?)\s* \s*
|
# Inline images
]*?>]*?\ alt="(?P[^"]*)"[^<>]*?\ width="(?P[^"]*)"\ [^<>]*?/>(?!\s*)
|
# decorative images
]*?\ alt="(?P[^"]*)"\ [^<>]*?/>
''')
# Image notes
# thumb     class="thumbimage" with class="magnify"
# frame     class="thumbimage" without class="magnify"
# border    class="thumbborder"
# link=     Removes class="image" from   (comment truncated in this copy)


def main():
    """Print the image / alt-text listing for every requested page.

    Command-line arguments (as returned by ``wikipedia.handleArgs()``):

    * ``-page:TITLE``  -- process TITLE; also switches ``site`` to that page's site.
    * ``-interface:``  -- enable ``cgitb`` tracebacks.
    * anything else is offered to ``pagegenerators.GeneratorFactory``; arguments
      it rejects are reported with ``wikipedia.output``.

    Pages whose download fails, or that fail the history-tab check, are
    reported and skipped.

    Raises:
        RuntimeError: if an ``ImageR`` match sets none of the four alt-text
            groups (should be unreachable when the pattern is intact).
    """
    genFactory = pagegenerators.GeneratorFactory()
    site = wikipedia.getSite()
    page = wikipedia.Page(site, '')
    for arg in wikipedia.handleArgs():
        if arg.startswith('-page:'):
            page = wikipedia.Page(site, arg[6:])
            site = page.site()
        elif arg.startswith('-interface:'):
            import cgitb
            cgitb.enable()
        elif not genFactory.handleArg(arg):
            wikipedia.output('Parameter "%s" not understood' % arg)

    # Fall back to the single -page: page when no generator argument was given.
    generator = genFactory.getCombinedGenerator() or iter([page])
    for page in generator:
        if not page.title():
            continue
        try:
            # We get address from the   (comment truncated in this copy)
            # The current time is appended as the query string.
            html = site.getUrl(site.nice_get_address(page.urlname())
                               + "?" + time.strftime('%Y%m%d%H%M%S'))
            # Check to see if there's a history tab, if not then the page does not exist
            # NOTE only needed if nogzip is set
            # NOTE(review): markup lost in the pattern below.
            if not re.search('\n  • ]*>\n  • ', html):
                wikipedia.output("%s does not exist on %s"
                                 % (page.aslink(), page.site().sitename()))
                continue
            # Strip to body content
            # NOTE(review): markup lost -- both markers are empty strings here.
            html = html[html.index(""):html.index("")]
        except Exception as e:
            wikipedia.output('Error: %r' % e)
            continue

        # For debugging purposes print all comments
        # NOTE(review): markup lost in the pattern below.
        for cmt in re.findall('', html, re.DOTALL):
            print(cmt)

        # Translation substitutions
        wgServer = '%s://%s' % (site.protocol(), site.hostname(),)
        # NOTE(review): markup lost -- m.group(1) below needs a capture group
        # holding a %Y%m%d%H%M%S timestamp.
        m = re.search(r'', html)
        if m:
            parseddate = time.strftime(
                "%d %B %Y at %H:%M",
                time.strptime(m.group(1), "%Y%m%d%H%M%S"))
        else:
            # No timestamp found: the lead text will show "None".
            parseddate = None
        metadata = {
            'title': page.title(),
            'server': wgServer,
            'url': wgServer + site.nice_get_address(page.urlname()),
            'parseddate': parseddate,
        }

        #
        # NOTE(review): markup lost in the output literals from here on; the
        # number of %s placeholders does not always match the argument tuple.
        # NOTE(review): this first print follows a bare '#' in the collapsed
        # source and may have been commented out originally -- confirm.
        print('\n\n    %s\n\n    ' % page.title().encode('utf-8'))
        if site.language() not in ('en', 'simple', ):
            print('\n    ')
        print((wikipedia.translate(site, lead) % metadata).encode('utf-8'))
        print('')
        print('' % (wikipedia.translate(site, ImageDesc),
                    wikipedia.translate(site, AltDesc)))

        #
        for image in ImageR.finditer(html):
            matched = image.group()
            videoalt = image.group('videoalt')
            thumbalt = image.group('thumbalt')
            imagealt = image.group('imagealt')
            decalt = image.group('decalt')

            # TODO: better support for galleries, math equation
            if decalt is not None:
                if decalt == '' and re.search(r' width="(\d\d|\d)"', matched):
                    # Exclude images less than 100 pixels
                    # (one- or two-digit width and empty alt)
                    continue
                elif "/skins-1.5/" in matched:
                    continue
                elif "/w/extensions/" in matched:
                    continue
                elif ' class="tex"' in matched:
                    # Exclude tex equation since they will at some point
                    # will have machine generated alt text
                    continue

            print('')
            # Root-relative attribute values are made absolute with wgServer.
            print('\n    %s%s\n    %s\n    '
                  % (matched.replace('="/', '="%s/' % wgServer),))

            if videoalt is not None:
                # Swap the poster for its alt text and drop the magnify part.
                alt_cell = "\n    %s\n    " % (
                    'defaultalt' if videoalt == '' else '', videoalt)
                print(matched.replace(image.group('poster'), alt_cell)
                      .replace(image.group('videomagnify'), '')
                      .replace('="/', '="%s/' % wgServer))
            elif thumbalt is not None:
                # Same treatment for thumbnails / framed images.
                alt_cell = "\n    %s\n    " % (
                    'defaultalt' if thumbalt == '' else '', thumbalt)
                print(matched.replace(image.group('image'), alt_cell)
                      .replace(image.group('thumbmagnify'), '')
                      .replace('="/', '="%s/' % wgServer))
            elif imagealt is not None:
                print('\n    %s\n    ' % (
                    'defaultalt' if imagealt == '' else '',
                    image.group('imagewidth'),
                    '%s' % imagealt if imagealt else ''))
            elif decalt is not None:
                print('' % (decalt, ))
            else:
                # The original used a bare ``raise`` here, which has no active
                # exception to re-raise; fail with an explicit error instead.
                raise RuntimeError(
                    'ImageR match has no alt text group: %r' % matched)

        # NOTE(review): indentation was lost in this copy; this closing print
        # may belong inside the image loop instead -- confirm.
        print('\n    ')


if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
    try:
        # NOTE(review): markup lost -- the head argument is empty in this copy.
        wikipedia.startContent(form=True, head="""""")
        main()
    finally:
        wikipedia.endContent()
        wikipedia.stopme()