#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
¶ms;
-test Test the routines used for regression testing
-namespace:n Number or name of namespace to process. The parameter can be
given more than once to add additional namespaces
commonfixes applies fixes which are either general or specific to the English Wikipedia
"""
# TODO
# TIP: use "%(dictname)s" % groupdict() a
# better ref combining , combine urls and on ignoring a list of character (matching)
# Separate English from generic wikisyntax
# Separate enwiki-specific fixes
# steal stuff from
# http://en.wikipedia.org/wiki/User:Polbot/source/Reffix.pl
# FIXME:
# http://en.wikipedia.org/w/index.php?title=London&diff=prev&oldid=253531178 (infobox)
# http://en.wikipedia.org/w/index.php?title=Hoover_Dam&diff=prev&oldid=253529821
# FIXME:
# http://en.wikipedia.org/w/index.php?title=Rolls-Royce_RR300&diff=190562064&oldid=175311735
# http://www.nationaltrust.org/magazine/archives/arc_news_2007/010807.htm
# http://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1186&context=theses
import re, urllib
import wikipedia, pagegenerators
try:
import noreferences
except ImportError:
noreferences = None
if True:
import MySQLdb
else:
MySQLdb = False
# Placeholder -> replacement text for this module's docstring.  The key is
# the same marker that appears on the first line of the docstring at the top
# of the file; the value is the shared parameter help from pagegenerators.
# NOTE(review): the code that performs the substitution is not visible in
# this part of the file -- presumably the framework's help output; confirm.
# NOTE(review): the marker looks like a mis-encoded "&params;".  If it is
# ever corrected, the docstring and this key must be changed together.
docuReplacements = {
'¶ms;': pagegenerators.parameterHelp,
}
# Calendar words, grouped by kind.  Judging by the variable name these are
# words to ignore when something is being treated as a name; the consumer is
# outside this part of the file.
# NOTE(review): month words are lower-case while weekday words are
# capitalised -- confirm that the consumer compares case-insensitively.
_MONTH_WORDS = (
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
)
_WEEKDAY_WORDS = (
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
)
ignoreAsNames = _MONTH_WORDS + _WEEKDAY_WORDS
# NOT IMPLEMENTED PROPERLY
# Alias -> canonical news agency name.  Meant for rewriting the
# work/publisher of cite news into |agency="canonical name".
# Candidates that are not enabled yet:
#   "DPA": "Deutsche Presse-Agentur",
#   "AFP": "Agence France-Presse",
agencies = dict(
    (alias, "Associated Press")
    for alias in (
        "AP",
        "The Associated Press",
        "Associated Press",
        "AP News",
    )
)
# "The" will be stripped if it exist
# So don't include Edge case e.g. "People" and "The People"
# The entries are joined with "|" into a regular expression as they are
# (see the publisher unlinking in formatEnglishWikipediaTemplate), so they
# must stay free of regex metacharacters.
# NOTE(review): "The Times" and "The Register" are listed with their
# article -- presumably on purpose; confirm.
_PRINT_AND_BROADCAST_PUBLISHERS = (
    "American Journalism Review",
    "Associated Press",
    "BBC News",
    "BBC",
    "Boston Globe",
    "Chicago Tribune",
    "CNN",
    "Daily Telegraph",
    "Economist",
    "Guardian",
    "Huffington Post",
    "International Herald Tribune",
    "MTV",
    "New York Times",
    "NY Times",
    "Observer",
    "The Times",
    "The Register",
    "San Francisco Chronicle",
    "Scientific American",
    "Seattle Times",
    "Reuters",
    "Rolling Stone",
    "Wall Street Journal",
    "Washington Post",
)
# Web only sources
_WEB_ONLY_PUBLISHERS = (
    "IGN",
    "GameStop",
    "Electronic Gaming Monthly",
    "Kotaku",
    "Ars Technica",
    "Joystiq",
    "Tom's Hardware",
    "Salon",
    "United Press International",  # since 1907
)
commonPublishers = _PRINT_AND_BROADCAST_PUBLISHERS + _WEB_ONLY_PUBLISHERS
# Template chooser (not implemented yet).
# Each row is (template to match, replacement template, regex condition).
# The condition is a pattern over the template's parameters; host names in
# it have their dots escaped so that "." only matches a literal dot.
# NOTE(review): "britannica.com" replaces the former "betanitca.com", which
# is assumed to have been a misspelling -- confirm.  "harvard.edu" replaces
# the misspelt "havard.edu".
tpl_cite = (
    ('cite web', 'cite encyclopedia', r'\|\s*url\s*=\s*http://(www\.)?(encarta\.com|encarta\.msn\.com|britannica\.com)'),
    ('cite web', 'cite news', r'\|\s*url\s*=\s*http://(www\.)?(nytimes\.com|ap\.google\.com|news\.bbc\.co\.uk|time\.com|economist\.com|timesonline\.co\.uk|channelonline\.tv|cnn\.com|independent\.co\.uk|cbc\.ca|theglobeandmail\.com)/'),
    ('cite web', 'cite paper', r'\|\s*url\s*=\s*http://(www\.)?(harvard\.edu)'),
    ('cite web', 'cite news', r'\|\s*agency\s*='),
    ('cite web', 'cite book', r'\|\s*isbn\s*=\s*[^\s{|}[\]]'),
)
# HTML tag names, kept in the four groups of the original listing.  A name
# may occur in more than one group; the combined tuple keeps every
# occurrence, in order.
# pairs
_PAIRED_TAGS = (
    "b", "i", "u", "font", "big", "small", "sub", "sup",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "cite", "code", "em", "s", "span", "strike", "strong", "tt", "var",
    "div", "center", "blockquote", "ol", "ul", "dl",
    "table", "caption", "pre", "ruby", "rt", "rb", "rp",
)
# single
_SINGLE_TAGS = ("br", "p", "hr", "li", "dt", "dd")
# nest
_NESTABLE_TAGS = (
    "table", "tr", "td", "th", "div", "blockquote", "ol", "ul",
    "dl", "font", "big", "small", "sub", "sup",
)
# table tags
_TABLE_TAGS = ("td", "th", "tr")
htmltags = _PAIRED_TAGS + _SINGLE_TAGS + _NESTABLE_TAGS + _TABLE_TAGS
# HTML attribute names as a tuple of strings.  The names are written as
# whitespace-separated words and split once at import time; order and
# repeated names ("width" occurs twice) are kept as listed.
htmlattrs = tuple((
    "title align lang dir width height "
    "bgcolor clear noshade "
    "cite size face color "
    "type start value compact "
    #/* For various lists, mostly deprecated but safe */
    "summary width border frame rules "
    "cellspacing cellpadding valign char "
    "charoff colgroup col span abbr axis "
    "headers scope rowspan colspan "
    "id class name style"
).split())
# CSS HEX color values to named (<9 chars) color table.
# Each hex value appears exactly once.  Where two names share a hex value the
# entry keeps the name that was effective before (the later of the two in
# the former single-line literal); the shadowed alias is noted alongside.
namedColors = {
    '#00FFFF': 'cyan',      # alias: aqua
    '#F0FFFF': 'azure',
    '#F5F5DC': 'beige',
    '#FFE4C4': 'bisque',
    '#000000': 'black',
    '#0000FF': 'blue',
    '#A52A2A': 'brown',
    '#FF7F50': 'coral',
    '#FFF8DC': 'cornsilk',
    '#DC143C': 'crimson',
    '#00008B': 'darkBlue',
    '#008B8B': 'darkCyan',
    '#A9A9A9': 'darkGrey',  # alias: darkGray
    '#8B0000': 'darkRed',
    '#FF1493': 'deepPink',
    '#696969': 'dimGrey',   # alias: dimGray
    '#FF00FF': 'magenta',   # alias: fuchsia
    '#FFD700': 'gold',
    '#808080': 'grey',      # alias: gray
    '#008000': 'green',
    '#F0FFF0': 'honeyDew',
    '#FF69B4': 'hotPink',
    '#4B0082': 'indigo',
    '#FFFFF0': 'ivory',
    '#F0E68C': 'khaki',
    '#E6E6FA': 'lavender',
    '#00FF00': 'lime',
    '#FAF0E6': 'linen',
    '#800000': 'maroon',
    '#FFE4B5': 'moccasin',
    '#000080': 'navy',
    '#FDF5E6': 'oldLace',
    '#808000': 'olive',
    '#FFA500': 'orange',
    '#DA70D6': 'orchid',
    '#CD853F': 'peru',
    '#FFC0CB': 'pink',
    '#DDA0DD': 'plum',
    '#800080': 'purple',
    '#FF0000': 'red',
    '#FA8072': 'salmon',
    '#2E8B57': 'seaGreen',
    '#FFF5EE': 'seaShell',
    '#A0522D': 'sienna',
    '#C0C0C0': 'silver',
    '#87CEEB': 'skyBlue',
    '#FFFAFA': 'snow',
    '#D2B48C': 'tan',
    '#008080': 'teal',
    '#D8BFD8': 'thistle',
    '#FF6347': 'tomato',
    '#EE82EE': 'violet',
    '#F5DEB3': 'wheat',
    '#FFFFFF': 'white',
    '#FFFF00': 'yellow',
}
# Interwiki map for converting links to interwiki form.
# Keys are the interwiki names; values are URL templates in which "$1" marks
# the slot for the link target (presumably the page name or identifier --
# confirm against the code that consumes this table, which is outside this
# part of the file).
# NOTE(review): the "CoLab" entry has no "$1" slot in its URL.
# Table format | NAME || URI
interwiki_map = {
"AbbeNormal": "http://ourpla.net/cgi/pikie?$1",
"Acronym": "http://www.acronymfinder.com/af-query.asp?String=exact&Acronym=$1",
"advisory": "http://advisory.wikimedia.org/wiki/$1",
"Advogato": "http://www.advogato.org/$1",
"Aew": "http://wiki.arabeyes.org/$1",
"Airwarfare": "http://airwarfare.com/mediawiki-1.4.5/index.php?$1",
"AIWiki": "http://www.ifi.unizh.ch/ailab/aiwiki/aiw.cgi?$1",
"AllWiki": "http://allwiki.com/index.php/$1",
"Appropedia": "http://www.appropedia.org/$1",
"AquariumWiki": "http://www.theaquariumwiki.com/$1",
"arXiv": "http://arxiv.org/abs/$1",
"AspieNetWiki": "http://aspie.mela.de/index.php/$1",
"AtmWiki": "http://www.otterstedt.de/wiki/index.php/$1",
"BattlestarWiki": "http://en.battlestarwiki.org/wiki/$1",
"BEMI": "http://bemi.free.fr/vikio/index.php?$1",
"BenefitsWiki": "http://www.benefitslink.com/cgi-bin/wiki.cgi?$1",
"BibleWiki": "http://bible.tmtm.com/wiki/$1",
"BluWiki": "http://www.bluwiki.org/go/$1",
"Botwiki": "http://botwiki.sno.cc/wiki/$1",
"Boxrec": "http://www.boxrec.com/media/index.php?$1",
"BrickWiki": "http://brickwiki.org/index.php?title=$1",
"BridgesWiki": "http://c2.com:8000/$1",
"bugzilla": "https://bugzilla.wikimedia.org/show_bug.cgi?id=$1",
"buzztard": "http://buzztard.org/index.php/$1",
"Bytesmiths": "http://www.Bytesmiths.com/wiki/$1",
"C2find": "http://c2.com/cgi/wiki?FindPage&value=$1",
"Cache": "http://www.google.com/search?q=cache:$1",
"CanyonWiki": "http://www.canyonwiki.com/wiki/index.php/$1",
"CANWiki": "http://www.can-wiki.info/$1",
"CellWiki": "http://cell.wikia.com/wiki/$1",
"CentralWikia": "http://www.wikia.com/wiki/$1",
"ChoralWiki": "http://www.cpdl.org/wiki/index.php/$1",
"Ciscavate": "http://ciscavate.org/index.php/$1",
"Citizendium": "http://en.citizendium.org/wiki/$1",
"CKWiss": "http://ck-wissen.de/ckwiki/index.php?title=$1",
"CNDbName": "http://cndb.com/actor.html?name=$1",
"CNDbTitle": "http://cndb.com/movie.html?title=$1",
"CoLab": "http://colab.info",
"Comixpedia": "http://www.comixpedia.org/index.php?title=$1",
"comcom": "http://comcom.wikimedia.org/wiki/$1",
"CommunityScheme": "http://community.schemewiki.org/?c=s&key=$1",
"comune": "http://rete.comuni-italiani.it/wiki/$1",
"Consciousness": "http://teadvus.inspiral.org/index.php/$1",
"CorpKnowPedia": "http://corpknowpedia.org/wiki/index.php/$1",
"CrazyHacks": "http://www.crazy-hacks.org/wiki/index.php?title=$1",
"CreaturesWiki": "http://creatures.wikia.com/wiki/$1",
"DAwiki": "http://www.dienstag-abend.de/wiki/index.php/$1",
"Dcc": "http://www.dccwiki.com/$1",
"DCDatabase": "http://www.dcdatabaseproject.com/wiki/$1",
"DCMA": "http://www.christian-morgenstern.de/dcma/$1",
"DejaNews": "http://www.deja.com/=dnc/getdoc.xp?AN=$1",
"Delicious": "http://del.icio.us/tag/$1",
"Demokraatia": "http://wiki.demokraatia.ee/index.php/$1",
"Devmo": "http://developer.mozilla.org/en/docs/$1",
"dict": "http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=$1",
"Disinfopedia": "http://www.sourcewatch.org/wiki.phtml?title=$1",
"distributedproofreaders": "http://www.pgdp.net/wiki/$1",
"distributedproofreadersca": "http://www.pgdpcanada.net/wiki/index.php/$1",
"dmoz": "http://www.dmoz.org/$1",
"dmozs": "http://www.dmoz.org/cgi-bin/search?search=$1",
"DocBook": "http://wiki.docbook.org/topic/$1",
"DOI": "http://dx.doi.org/$1",
"doom_wiki": "http://doom.wikia.com/wiki/$1",
"download": "http://download.wikimedia.org/$1",
"DRAE": "http://buscon.rae.es/draeI/SrvltGUIBusUsual?LEMA=$1",
"Dreamhost": "http://wiki.dreamhost.com/index.php/$1",
"DrumCorpsWiki": "http://www.drumcorpswiki.com/index.php/$1",
"DWJWiki": "http://www.suberic.net/cgi-bin/dwj/wiki.cgi?$1",
"EcoReality": "http://www.EcoReality.org/wiki/$1",
"EfnetCeeWiki": "http://purl.net/wiki/c/$1",
"EfnetCppWiki": "http://purl.net/wiki/cpp/$1",
"EfnetPythonWiki": "http://purl.net/wiki/python/$1",
"EfnetXmlWiki": "http://purl.net/wiki/xml/$1",
"ELibre": "http://enciclopedia.us.es/index.php/$1",
"EmacsWiki": "http://www.emacswiki.org/cgi-bin/wiki.pl?$1",
"EnergieWiki": "http://www.netzwerk-energieberater.de/wiki/index.php/$1",
"EoKulturCentro": "http://esperanto.toulouse.free.fr/nova/wikini/wakka.php?wiki=$1",
"Ethnologue": "http://www.ethnologue.com/show_language.asp?code=$1",
"EvoWiki": "http://wiki.cotch.net/index.php/$1",
"Exotica": "http://www.exotica.org.uk/wiki/$1",
"FanimutationWiki": "http://wiki.animutationportal.com/index.php/$1",
"FinalEmpire": "http://final-empire.sourceforge.net/cgi-bin/wiki.pl?$1",
"FinalFantasy": "http://finalfantasy.wikia.com/wiki/$1",
"Finnix": "http://www.finnix.org/$1",
"FlickrUser": "http://www.flickr.com/people/$1",
"FloralWIKI": "http://www.floralwiki.co.uk/wiki/$1",
"FlyerWiki-de": "http://de.flyerwiki.net/index.php/$1",
"Foldoc": "http://www.foldoc.org/$1",
"ForthFreak": "http://wiki.forthfreak.net/index.cgi?$1",
"FoxWiki": "http://fox.wikis.com/wc.dll?Wiki~$1",
"FreeBio": "http://freebiology.org/wiki/$1",
"FreeBSDman": "http://www.FreeBSD.org/cgi/man.cgi?apropos=1&query=$1",
"FreeCultureWiki": "http://wiki.freeculture.org/index.php/$1",
"Freedomdefined": "http://freedomdefined.org/$1",
"FreeFeel": "http://freefeel.org/wiki/$1",
"FreekiWiki": "http://wiki.freegeek.org/index.php/$1",
"ganfyd": "http://ganfyd.org/index.php?title=$1",
"GaussWiki": "http://gauss.ffii.org/$1",
"Gentoo-Wiki": "http://gentoo-wiki.com/$1",
"GenWiki": "http://wiki.genealogy.net/index.php/$1",
"GlobalVoices": "http://cyber.law.harvard.edu/dyn/globalvoices/wiki/$1",
"GlossarWiki": "http://glossar.hs-augsburg.de/$1",
"GlossaryWiki": "http://glossary.hs-augsburg.de/$1",
"Golem": "http://golem.linux.it/index.php/$1",
"Google": "http://www.google.com/search?q=$1",
"GoogleDefine": "http://www.google.com/search?q=define:$1",
"GoogleGroups": "http://groups.google.com/groups?q=$1",
"GotAMac": "http://www.got-a-mac.org/$1",
"GreatLakesWiki": "http://greatlakeswiki.org/index.php/$1",
"Guildwiki": "http://guildwars.wikia.com/wiki/$1",
"gutenberg": "http://www.gutenberg.org/etext/$1",
"gutenbergwiki": "http://www.gutenberg.org/wiki/$1",
"H2Wiki": "http://halowiki.net/p/$1",
"HammondWiki": "http://www.dairiki.org/HammondWiki/index.php3?$1",
"heroeswiki": "http://heroeswiki.com/$1",
"HerzKinderWiki": "http://www.herzkinderinfo.de/Mediawiki/index.php/$1",
"HKMule": "http://www.hkmule.com/wiki/$1",
"HolshamTraders": "http://www.holsham-traders.de/wiki/index.php/$1",
"HRWiki": "http://www.hrwiki.org/index.php/$1",
"HRFWiki": "http://fanstuff.hrwiki.org/index.php/$1",
"HumanCell": "http://www.humancell.org/index.php/$1",
"HupWiki": "http://wiki.hup.hu/index.php/$1",
"IMDbName": "http://www.imdb.com/name/nm$1/",
"IMDbTitle": "http://www.imdb.com/title/tt$1/",
"IMDbCompany": "http://www.imdb.com/company/co$1/",
"IMDbCharacter": "http://www.imdb.com/character/ch$1/",
"Incubator": "http://incubator.wikimedia.org/wiki/$1",
"infoAnarchy": "http://www.infoanarchy.org/en/$1",
"Infosecpedia": "http://www.infosecpedia.org/pedia/index.php/$1",
"Infosphere": "http://theinfosphere.org/$1",
"irc": "irc://irc.freenode.net/$1",
"Iuridictum": "http://iuridictum.pecina.cz/w/$1",
"JamesHoward": "http://jameshoward.us/$1",
"JavaNet": "http://wiki.java.net/bin/view/Main/$1",
"Javapedia": "http://wiki.java.net/bin/view/Javapedia/$1",
"JEFO": "http://esperanto-jeunes.org/wiki/$1",
"JiniWiki": "http://www.cdegroot.com/cgi-bin/jini?$1",
"JspWiki": "http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=$1",
"JSTOR": "http://www.jstor.org/journals/$1",
"Kamelo": "http://kamelopedia.mormo.org/index.php/$1",
"Karlsruhe": "http://ka.stadtwiki.net/$1",
"KerimWiki": "http://wiki.oxus.net/$1",
"KinoWiki": "http://kino.skripov.com/index.php/$1",
"KmWiki": "http://kmwiki.wikispaces.com/$1",
"KontuWiki": "http://kontu.merri.net/wiki/$1",
"KoslarWiki": "http://wiki.koslar.de/index.php/$1",
"Kpopwiki": "http://www.kpopwiki.com/$1",
"LinguistList": "http://linguistlist.org/forms/langs/LLDescription.cfm?code=$1",
"LISWiki": "http://liswiki.org/wiki/$1",
"LiteratePrograms": "http://en.literateprograms.org/$1",
"Livepedia": "http://www.livepedia.gr/index.php?title=$1",
"Lojban": "http://www.lojban.org/tiki/tiki-index.php?page=$1",
"Lostpedia": "http://lostpedia.wikia.com/wiki/$1",
"LQWiki": "http://wiki.linuxquestions.org/wiki/$1",
"LugKR": "http://lug-kr.sourceforge.net/cgi-bin/lugwiki.pl?$1",
"Luxo": "http://toolserver.org/~luxo/contributions/contributions.php?user=$1",
"lyricwiki": "http://www.lyricwiki.org/$1",
"mail": "https://lists.wikimedia.org/mailman/listinfo/$1",
"mailarchive": "http://lists.wikimedia.org/pipermail/$1",
"Mariowiki": "http://www.mariowiki.com/$1",
"MarvelDatabase": "http://www.marveldatabase.com/wiki/index.php/$1",
"MeatBall": "http://www.usemod.com/cgi-bin/mb.pl?$1",
"MemoryAlpha": "http://memory-alpha.org/en/wiki/$1",
"MetaWiki": "http://sunir.org/apps/meta.pl?$1",
"Mineralienatlas": "http://www.mineralienatlas.de/lexikon/index.php/$1",
"MoinMoin": "http://moinmo.in/$1",
"Monstropedia": "http://www.monstropedia.org/?title=$1",
"MosaPedia": "http://mosapedia.de/wiki/index.php/$1",
"MozCom": "http://mozilla.wikia.com/wiki/$1",
"MozillaWiki": "http://wiki.mozilla.org/$1",
"MozillaZineKB": "http://kb.mozillazine.org/$1",
"MusicBrainz": "http://wiki.musicbrainz.org/$1",
"MW": "http://www.mediawiki.org/wiki/$1",
"MWOD": "http://www.merriam-webster.com/cgi-bin/dictionary?book=Dictionary&va=$1",
"MWOT": "http://www.merriam-webster.com/cgi-bin/thesaurus?book=Thesaurus&va=$1",
"NetVillage": "http://www.netbros.com/?$1",
"NKcells": "http://www.nkcells.info/wiki/index.php/$1",
"NoSmoke": "http://no-smok.net/nsmk/$1",
"Nost": "http://nostalgia.wikipedia.org/wiki/$1",
"OEIS": "http://www.research.att.com/~njas/sequences/$1",
"OldWikisource": "http://wikisource.org/wiki/$1",
"OLPC": "http://wiki.laptop.org/go/$1",
"OneLook": "http://www.onelook.com/?ls=b&w=$1",
"OpenFacts": "http://openfacts.berlios.de/index.phtml?title=$1",
"Openstreetmap": "http://wiki.openstreetmap.org/wiki/$1",
"OpenWetWare": "http://openwetware.org/wiki/$1",
"OpenWiki": "http://openwiki.com/?$1",
"Opera7Wiki": "http://operawiki.info/$1",
"OrganicDesign": "http://www.organicdesign.co.nz/$1",
"OrgPatterns": "http://www.bell-labs.com/cgi-user/OrgPatterns/OrgPatterns?$1",
"OrthodoxWiki": "http://orthodoxwiki.org/$1",
"OSI reference model": "http://wiki.tigma.ee/index.php/$1",
"OTRS": "https://ticket.wikimedia.org/otrs/index.pl?Action=AgentTicketZoom&TicketID=$1",
"OTRSwiki": "http://otrs-wiki.wikimedia.org/wiki/$1",
"OurMedia": "http://www.socialtext.net/ourmedia/index.cgi?$1",
"PaganWiki": "http://www.paganwiki.org/wiki/index.php?title=$1",
"Panawiki": "http://wiki.alairelibre.net/wiki/$1",
"PangalacticOrg": "http://www.pangalactic.org/Wiki/$1",
"PerlConfWiki": "http://perl.conf.hu/index.php/$1",
"PerlNet": "http://perl.net.au/wiki/$1",
"PersonalTelco": "http://www.personaltelco.net/index.cgi/$1",
"PHWiki": "http://wiki.pocketheaven.com/$1",
"PhpWiki": "http://phpwiki.sourceforge.net/phpwiki/index.php?$1",
"PlanetMath": "http://planetmath.org/?op=getobj&from=objects&id=$1",
"PMEG": "http://www.bertilow.com/pmeg/$1.php",
"PMWiki": "http://old.porplemontage.com/wiki/index.php/$1",
"PurlNet": "http://purl.oclc.org/NET/$1",
"pyrev": "http://svn.wikimedia.org/viewvc/pywikipedia?view=rev&revision=$1",
"PythonInfo": "http://www.python.org/cgi-bin/moinmoin/$1",
"PythonWiki": "http://www.pythonwiki.de/$1",
"psycle": "http://psycle.sourceforge.net/wiki/$1",
"qcwiki": "http://wiki.quantumchemistry.net/index.php/$1",
"quality": "http://quality.wikimedia.org/wiki/$1",
"Qwiki": "http://qwiki.caltech.edu/wiki/$1",
"r3000": "http://prinsig.se/weekee/$1",
"RakWiki": "http://rakwiki.no-ip.info/$1",
"Raec": "http://www.raec.clacso.edu.ar:8080/raec/Members/raecpedia/$1",
"rev": "http://www.mediawiki.org/wiki/Special:Code/MediaWiki/$1",
"ReVo": "http://purl.org/NET/voko/revo/art/$1.html",
"RFC": "http://tools.ietf.org/html/rfc$1",
"RheinNeckar": "http://wiki.rhein-neckar.de/index.php/$1",
"RoboWiki": "http://robowiki.net/?$1",
"ReutersWiki": "http://glossary.reuters.com/index.php/$1",
"RoWiki": "http://wiki.rennkuckuck.de/index.php/$1",
"rtfm": "ftp://rtfm.mit.edu/pub/faqs/$1",
"S23Wiki": "http://s23.org/wiki/$1",
"Scholar": "http://scholar.google.com/scholar?q=$1",
"SchoolsWP": "http://schools-wikipedia.org/wiki/$1",
"Scores": "http://www.imslp.org/wiki/$1",
"Scoutwiki": "http://en.scoutwiki.org/$1",
"Scramble": "http://www.scramble.nl/wiki/index.php?title=$1",
"SeaPig": "http://www.seapig.org/$1",
"SeattleWiki": "http://seattlewiki.org/wiki/$1",
"SeattleWireless": "http://seattlewireless.net/?$1",
"SLWiki": "http://wiki.secondlife.com/wiki/$1",
"SenseisLibrary": "http://senseis.xmp.net/?$1",
"silcode": "http://www.sil.org/iso639-3/documentation.asp?id=$1",
"Shakti": "http://cgi.algonet.se/htbin/cgiwrap/pgd/ShaktiWiki/$1",
"Slashdot": "http://slashdot.org/article.pl?sid=$1",
"SMikipedia": "http://www.smiki.de/$1",
"SourceForge": "http://sourceforge.net/$1",
"spcom": "http://spcom.wikimedia.org/wiki/$1",
"species": "http://species.wikimedia.org/wiki/$1",
"Squeak": "http://wiki.squeak.org/squeak/$1",
"stable": "http://stable.toolserver.org/$1",
"strategy" : "http://strategy.wikimedia.org/wiki/$1",
"StrategyWiki": "http://strategywiki.org/wiki/$1",
"sulutil": "http://toolserver.org/~vvv/sulutil.php?user=$1",
"Susning": "http://www.susning.nu/$1",
"Swtrain": "http://train.spottingworld.com/$1",
"svn": "http://svn.wikimedia.org/viewvc/mediawiki/$1?view=log",
"SVGWiki": "http://www.protocol7.com/svg-wiki/default.asp?$1",
"SwinBrain": "http://mercury.it.swin.edu.au/swinbrain/index.php/$1",
"SwingWiki": "http://www.swingwiki.org/$1",
"TabWiki": "http://www.tabwiki.com/index.php/$1",
"Takipedia": "http://www.takipedia.org/wiki/$1",
"Tavi": "http://tavi.sourceforge.net/$1",
"TclersWiki": "http://wiki.tcl.tk/$1",
"Technorati": "http://www.technorati.com/search/$1",
"TEJO": "http://www.tejo.org/vikio/$1",
"TESOLTaiwan": "http://www.tesol-taiwan.org/wiki/index.php/$1",
"Testwiki": "http://test.wikipedia.org/wiki/$1",
"Thelemapedia": "http://www.thelemapedia.org/index.php/$1",
"Theopedia": "http://www.theopedia.com/$1",
"ThePPN": "http://wiki.theppn.org/$1",
"ThinkWiki": "http://www.thinkwiki.org/wiki/$1",
"TibiaWiki": "http://tibia.erig.net/$1",
"ticket": "https://ticket.wikimedia.org/otrs/index.pl?Action=AgentTicketZoom&TicketNumber=$1",
"TMBW": "http://tmbw.net/wiki/$1",
"TmNet": "http://www.technomanifestos.net/?$1",
"TMwiki": "http://www.EasyTopicMaps.com/?page=$1",
"TokyoNights": "http://wiki.tokyo-nights.com/wiki/$1",
"Tools": "http://toolserver.org/$1",
"tswiki": "http://wiki.toolserver.org/view/$1",
"translatewiki": "http://translatewiki.net/wiki/$1",
"Trash!Italia": "http://trashware.linux.it/wiki/$1",
"Turismo": "http://www.tejo.org/turismo/$1",
"TVIV": "http://tviv.org/wiki/$1",
"TVtropes": "http://www.tvtropes.org/pmwiki/pmwiki.php/Main/$1",
"TWiki": "http://twiki.org/cgi-bin/view/$1",
"TwistedWiki": "http://purl.net/wiki/twisted/$1",
"TyvaWiki": "http://www.tyvawiki.org/wiki/$1",
"Unreal": "http://wiki.beyondunreal.com/wiki/$1",
"Urbandict": "http://www.urbandictionary.com/define.php?term=$1",
"USEJ": "http://www.tejo.org/usej/$1",
"UseMod": "http://www.usemod.com/cgi-bin/wiki.pl?$1",
"ValueWiki": "http://www.valuewiki.com/w/$1",
"Veropedia": "http://en.veropedia.com/a/$1",
"Vinismo": "http://vinismo.com/en/$1",
"VLOS": "http://www.thuvienkhoahoc.com/tusach/$1",
"VKoL": "http://kol.coldfront.net/thekolwiki/index.php/$1",
"VoIPinfo": "http://www.voip-info.org/wiki/view/$1",
"WarpedView": "http://www.warpedview.com/mediawiki/index.php/$1",
"WebDevWikiNL": "http://www.promo-it.nl/WebDevWiki/index.php?page=$1",
"Webisodes": "http://www.webisodes.org/$1",
"WebSeitzWiki": "http://webseitz.fluxent.com/wiki/$1",
"wg": "http://wg.en.wikipedia.org/wiki/$1",
"Wikianso": "http://www.ansorena.de/mediawiki/wiki/$1",
"Wikible": "http://wikible.org/en/$1",
"Wikichat": "http://www.wikichat.org/$1",
"WikiChristian": "http://www.wikichristian.org/index.php?title=$1",
"WikiF1": "http://www.wikif1.org/$1",
"WikiFur": "http://en.wikifur.com/wiki/$1",
"wikiHow": "http://www.wikihow.com/$1",
"WikiIndex": "http://wikiindex.com/$1",
"WikiLemon": "http://wiki.illemonati.com/$1",
"Wikilivres": "http://wikilivres.info/wiki/$1",
"WikiMac-de": "http://apfelwiki.de/wiki/Main/$1",
"WikiMac-fr": "http://www.wikimac.org/index.php/$1",
"Wikinfo": "http://www.wikinfo.org/index.php/$1",
"Wikinurse": "http://wikinurse.org/media/index.php?title=$1",
"Wikinvest": "http://www.wikinvest.com/$1",
"Wikipaltz": "http://www.wikipaltz.com/wiki/$1",
"Wikireason": "http://wikireason.net/wiki/$1",
"Wikischool": "http://www.wikischool.de/wiki/$1",
"wikisophia": "http://wikisophia.org/index.php?title=$1",
"Wikispot": "http://wikispot.org/?action=gotowikipage&v=$1",
"WikiTI": "http://wikiti.denglend.net/index.php?title=$1",
"WikiTravel": "http://wikitravel.org/en/$1",
"WikiTree": "http://wikitree.org/index.php?title=$1",
"Wipipedia": "http://www.londonfetishscene.com/wipi/index.php/$1",
"WLUG": "http://www.wlug.org.nz/$1",
"wmau": "http://wikimedia.org.au/wiki/$1",
"wmcz": "http://meta.wikimedia.org/wiki/Wikimedia_Czech_Republic/$1",
"wmno": "http://no.wikimedia.org/wiki/$1",
"wmrs": "http://rs.wikimedia.org/wiki/$1",
"wmse": "http://se.wikimedia.org/wiki/$1",
"wmuk": "http://uk.wikimedia.org/wiki/$1",
"wm2005": "http://wikimania2005.wikimedia.org/wiki/$1",
"wm2006": "http://wikimania2006.wikimedia.org/wiki/$1",
"wm2007": "http://wikimania2007.wikimedia.org/wiki/$1",
"wm2008": "http://wikimania2008.wikimedia.org/wiki/$1",
"wm2009": "http://wikimania2009.wikimedia.org/wiki/$1",
"wm2010": "http://wikimania2010.wikimedia.org/wiki/$1",
"wmania": "http://wikimania.wikimedia.org/wiki/$1",
"WMF": "http://wikimediafoundation.org/wiki/$1",
"Wookieepedia": "http://starwars.wikia.com/wiki/$1",
"World66": "http://www.world66.com/$1",
"WoWWiki": "http://www.wowwiki.com/$1",
"Wqy": "http://wqy.sourceforge.net/cgi-bin/index.cgi?$1",
"WurmPedia": "http://www.wurmonline.com/wiki/index.php/$1",
"WZNAN": "http://www.wikiznanie.ru/wiki/article/$1",
"Xboxic": "http://wiki.xboxic.com/$1",
"ZRHwiki": "http://www.zrhwiki.ch/wiki/$1",
"ZUM": "http://wiki.zum.de/$1",
"ZWiki": "http://www.zwiki.org/$1",
"ZZZ Wiki": "http://wiki.zzz.ee/index.php/$1",
}
def fixStyle(text):
    """Placeholder for style fixes; it does no work yet.

    The argument is ignored and the result is always None.
    """
    return None
def getdateformat(text):
    """Return the label of the date format to use for *text*.

    Labels: ISO, DMY, MDY.  For now *text* is not inspected and the
    answer is always 'DMY'.
    """
    date_format = 'DMY'
    return date_format
def fix(text="", page=None, verbose = True):
# Apply the general wikitext clean-ups to `text` and return the new text.
#   text    -- wikitext to process; when empty it is fetched with page.get()
#   page    -- page object; when missing, a Page for 'Special:ParserTests'
#              on wikipedia.getSite() is used
#   verbose -- accepted but not referenced inside this function
# Order of work: literal/regex hacks, comment handling, hideText(), the
# English Wikipedia template fixes, HTML and table normalisation, link
# clean-up, reference fixes, showText(), then the optional noreferences pass.
# NOTE(review): in this copy of the file the indentation is gone and several
# regex literals look damaged (group names after "(?P" and "\g" are missing,
# one raw string is split across two lines, and some patterns that the
# comments say target HTML comments or tags contain no such text).  Compare
# against the upstream source before running or rewriting any of it.
# Fall back to the parser-test page when no page was supplied.
if not page:
page = wikipedia.Page(wikipedia.getSite(), 'Special:ParserTests')
if not text:
text=page.get()
#
## Hacks
#
text = text.replace('http://www.news.bbc.co.uk', 'http://news.bbc.co.uk')
# TODO: Fix accessyear/accessdate mismatch
# The Peer Reviewer script for some time converted URLs into the following bad form
text = re.sub(r'\{\{[Cc]ite web\s*\|\s*url\s*=\s*http://(?P
[^{|}]+)\s*\|\s*title\s*=\s*(http://)?(?P=title)\s*()?\s*((\|format=(PDF|DOC))|(\|\s*accessdate *= *[^{|}]+))*\}\}', r'[http://\g]', text)
# a second time since we seem to be hitting limits
text = re.sub(r'\{\{[Cc]ite web\s*\|url=(http://[^{|}]+)\s*\|title=([^{=}]+)(\|format=(PDF|DOC))?\}\}', r'[\1 \2]', text)
# Following the collapse of MiB preference PDFbot converts to the new format when saving
text = re.sub(r'\{\{(PDF(?:link)?\|[^{|}]+\|[\d\.]+) \[\[[^|]+\|([KMG])iB\]\]\}\}', r'{{\1 \2B}}', text)
# EN MOS -- Format Retrieved XXXX
# FIXME ]. [[Encyklopedia Interia]]. Last accessed 20 January 2007.
text = re.sub(r'((?<=[]["\'])|[.,(;?])(?P[)"\']|) *((URL *)?([Ll]ink *)?([Ll]ast *)?([Aa]ccess|reach)e?d?( +on| +online)?|[Rr]etrie?ved?) +(?P(((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w+|\[\[|20\d\d|\d|0\d|1\d|2\d|3[01]|\]\])[,.)\s\-]*?)+)[,.)\s]*', r'.\g Retrieved \g.', text)
# deprecated date linking, remove in citations
# First pattern handles "[[day Month]] [[year]]", the second "[[Month day]] [[year]]".
text = re.sub(r'\[\[(\d+ (?:January|February|March|April|May|June|July|August|September|October|November|December))\]\],? \[\[(\d{4})\]\](?=[^<>]*)', r'\1 \2', text)
text = re.sub(r'\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d+)\]\],? \[\[(\d{4})\]\](?=[^<>]*)', r'\1, \2', text)
#
## Comments
#
# Update {{NoMoreLinks}}
# NOTE(review): as written this pattern matches any run of newlines; it
# presumably lost the comment text it was meant to match -- verify.
text = re.sub(r'(\n*\n)+', '{{subst:NoMoreLinks}}', text)
# Remove comment from the introduction of footnotes
# NOTE(review): the two patterns below only contain an optional newline, so
# they presumably lost the comment text they were meant to match -- verify.
text = re.sub(r"\n?", '', text)
# Remove outdated comments
text = re.sub(r'\n?', '', text)
#### Some bad script
###text = re.sub(r'\{\{cite web\|url=([\d.\-]+)\|title=[^{|}]*\|last=([^{|}]+)\|first=([^{|}]+)\}\}', r'\2 \3. \1', text)
# Now that we got all the stuff that deals with comments out the way we can hide them to prevent mismatching
text = hideText(text)
# English Wikipedia only, and only in namespaces 0, 2 and 6 (main, User and
# File in MediaWiki's default numbering).
if page.site().sitename() == 'wikipedia:en' and page.namespace() in [0, 2, 6]:
wikipedia.output("Apply English Wikipedia fixes")
text = formatEnglishWikipediaTemplate(page, text)
#
## HTML ##
#
# Convert HTML markup to wiki ''' & ''
# NOTE(review): the tag names appear to be missing from the two patterns
# below (and were missing from the original comment) -- verify.
text = re.sub(r"(?([^{|}<>\n']*?) (?!')", r"'''\1'''", text)
text = re.sub(r"(?([^{|}<>\n']*?)(?!')", r"''\1''", text)
# Standardize tables
text = re.sub(r'\n\|-+(?=[^{|}\n]*\n)', r'\n|-', text)
text = re.sub(r'\n\|-(?=\w)', r'\n|- ', text)
text = re.sub(r'\n\|-[^{}|<>\n]*(?=\n\|-)', r'', text)
text = re.sub(r'(\n\{\|[^][{}|<>\n]*)\n+(?=[|!][^+\-{}\n]+\n)', r'\1\n|-\n', text)
text = re.sub(r'\n\|-[^{}|<>\n]*\n*(?=\n\|\})', r'', text)
text = fixHTML(page,text)
# Snapshot taken so the second fixHTML() pass below only runs when the
# style merging changed something.
saved = text # saved state
# Merge styles in a table
# NOTE: the loop variable shadows the builtin `property` inside this function.
for property in ['text-align', 'vertical-align', 'font-size', 'font-family', 'color', 'background','background-color']:
# The patterns are not compiled with re.VERBOSE, so the line breaks inside
# the triple-quoted literals are part of the pattern.
text = re.sub(r'''
\|-([^\n{|}[\]]*?)( *
\|[^{|}[\]]*style="[^"]*('''+property+r''':[^;"]+;)[^"]*"[^{|}[\]]*\|[^|\n]*?((?:\n\|(?!-)|\|\|)[^{|}[\]]*style="[^"]*\3[^"]*"[^{|}[\]]*\|[^|\n]*)+)(?=
\|[-}])''', r'\n|-\1 style="\3" \2', text)
p = re.compile(r'''(
\|- style="[^"]*?('''+property+r''':[^";]+;)[^"]*?"[^\n{|}[\]]*(
\|(?!-)(?:[^[\]{|}]*\|[^\n]*?))*?
\|(?!-)[^{|}[\]]*style="[^"]*)\2 *(?=[^"]*"[^[\]{|}]*\|[^\n])''')
# Substitute repeatedly until the pattern no longer matches.
while p.search(text):
text = p.sub(r'\1', text)
if saved != text:
text = fixHTML(page,text)
#
## Hyperlinking ##
#
# Remove url junk (tracking, referrers, client info)
# The same substitution is applied nine times; the loop variable is unused.
# Presumably each pass strips one more junk parameter -- confirm.
for i in range(0,9):
text = re.sub(r'(http://[^][<>\s"|])(&client=firefox-a|<=)(?=[][<>\s"|&])', r'\1', text)
text = text.replace('[{{SERVER}}{{localurl:', '[{{fullurl:') # Use magic words instead
# text = re.sub(r'\[http://en.wikipedia.org/w/index.php\?title=([^][<>"\s&=?]+)&?([^][<>"\s]*)', r'[{{fullurl:\1|\2}}', text)
# convert (see http://...) into <http://...>, which is better handled by software
text = re.sub(r'(?i)[(](?:see|) *(http://[^][<>"\s(|)]+[\w=/&])\s?[)]', r'<\1>', text)
# From fixes.py
# external link in double brackets
text = re.sub(r'\[\[(?Phttps?://[^\]\n]+?)\]\]', r'[\g]', text)
# external link starting with double bracket
text = re.sub(r'\[\[(?Phttps?://.+?)\]', r'[\g]', text)
# pipe in url (unlikely to go wrong)
text = re.sub(r'\[(?Phttps?://[^][<>\s"\|;?]+?\.(aspx?|doc|f?cgi|html?|jsp|pdf|php|pl|ppt|rtf|txt|xml)) *\| *(?P[^\|\]]+?)\]', r'[\g \g]', text)
# Use of Image:
#if '[[Image:' in text:
# text = re.sub(r'(?i)\[\[(:?)File:([^][{|}]+\.(djvu|jpe?g|png|gif|svg|tiff))(?=\||\]\])', r'[[\1Image:\2', text)
# Links written with the Image: prefix to the listed non-image file types
# are rewritten to the File: prefix.
text = re.sub(r'(?i)\[\[(:?)Image:([^][{|}]+\.(pdf|midi?|ogg|ogv|xcf))(?=\||\]\])', r'[[\1File:\2', text)
# Commons fixes for URLs
# TODO: remove domain name titles [http://example.com/aboutus.pdf example.com]
# | url= http://www.statcan.ca/english/sdds/instrument/3901_Q2_V2_E.pdf] (fx by removing the invalid [])
# NOTE: the ur'...' literals below are Python 2 only syntax.
text = re.sub(ur'(http:/* *){2,}(?=[a-z0-9:.\-]+/)', 'http://', text) # Silently correct http://http:/
text = re.sub(ur"(\[\w+://[^][<>\"\s]+?)''", r"\1 ''", text) # corrects [http://''title''] (nospaces) -> [http:// ''title'']
text = re.sub(ur'(?u)\[\n*(\w+://[^][<>"\s]+ *(?:(?<= )[^\n\]<>]*?|))\n([^[\]<>{}\n=@/]*?) *\n*\]', ur'[\1 \2]', text) # Fix some links which were broken with a line break
text = re.sub(ur'(?u)\[(\w+://[^][<>"\s]+) +([Cc]lick here|[Hh]ere|\W|→|[ -/;-@]) *\]', ur'\2 [\1]', text) # remove unhelpful titles for screen readers
# Embedded images with bad anchors
text = re.sub(r'(?i)(\[\[(?:File|Image):[^][<>{|}]+)#(|filehistory|filelinks|file)(?=[\]|])', r'\1', text)
text = ext2intLinks(page, text)
text = simplifyLinks(page, text)
## References ##
# This is needed because of Image1.jpg|caption[this is hidden]
text = fixReferences(page, text)
# Counterpart of the hideText() call made earlier in this function.
text = showText(text)
# Optional pass: skipped when the noreferences module could not be imported,
# in namespace 10 (Template in MediaWiki's default numbering), and for the
# parser-test fallback page.
if noreferences and page.namespace() != 10 and page.title() != 'Special:ParserTests':
norefbot = noreferences.NoReferencesBot(None)
if norefbot.lacksReferences(text, verbose=False):
text = norefbot.addReferences(text)
return text
# Normalise citation-style templates with a fixed sequence of regular
# expression rewrites and return the rewritten text.
# `page` is accepted but is not referenced anywhere in this function.
# NOTE(review): indentation has been lost in this section, so the nesting of
# the loop bodies below (in particular whether the '|\n}}' replacement belongs
# to the `while` loop) must be confirmed before re-indenting.
def formatEnglishWikipediaTemplate(page, text):
'''
hello
'''
# merge all variant of cite web
# make into {'dictname':(t1, t2, t3),}
text = re.sub(r'(?i)\{\{\s*(cite[_ \-]*(url|web|website)|Web[_ \-]*(citation|cite|reference|reference[_ ]4))(?=\s*\|)', '{{cite web', text)
## Unlink
# Remove formatting on certian parameters
# (strips a leading ''/'''/''''' and the trailing run of quotes around the value)
text = re.sub(r"(\|\s*(?:agency|author|first|format|language|last|location|month|publisher|work|year)\s*=\s*)(''|'''|''''')((?:\[\[[^][|]+|\[\[|)[][\w\s,.~!`\"]+)(''+)(?=\s*\|[\w\s]+=|\s*\}\})", r'\1\3', text)
# Unlink well known publisher parameters (add work=?)
# The alternation is built from the module-level commonPublishers tuple.
text = re.sub(r'(?i)(\|\s*(?:publisher|newpaper)\s*=\s*)\[\[(?:[Tt]he )?('+('|'.join(commonPublishers))+')\]\]', r'\1\2', text)
# Unlink PDF in format parameters
text = re.sub(r'(?i)(\|\s*format\s*=\s*)\[\[(adobe|portable|document|file|format|pdf|\.|\s|\(|\)|\|)+\]\]', r'\1PDF', text)
text = re.sub(r'(?i)(\|\s*format\s*=\s*)(\s*\.?(adobe|portable|document|file|format|pdf|\(|\)))+?(\s*[|}])', r'\1PDF\4', text)
# No |format=HTML says {{cite web/doc}}
text = re.sub(r'(?i)(\{\{cite[^{}]+)\|\s*format\s*=\s*(\[\[[^][|]+\||\[\[|)(\]\]| |html?|world|wide|web)+\s*(?=\||\}\})', r'\1', text)
## Fix parameters
# Fix accessdate tags [[WP:AWB/FR#Fix accessdate tags]]
text = re.sub(r'(\|\s*)a[ces]{3,8}date(\s*=\s*)(?=[^{|}]*20\d\d|\}\})', r'\1accessdate\2', text)
text = re.sub(r'accessdate(\s*=\s*)\[*(200\d)[/_\-](\d{2})[/_\-](\d{2})\]*', r'accessdate\1\2-\3-\4', text)
text = re.sub(r'(\|\s*)a[cs]*es*mou*nthday(\s*=\s*)', r'\1accessmonthday\2', text)
text = re.sub(r'(\|\s*)a[cs]*es*daymou*nth(\s*=\s*)', r'\1accessdaymonth\2', text)
# accessdate holding only day+month (with a separate accessyear) is renamed,
# then the two parts are merged back into a single accessdate below.
text = re.sub(r'(\|\s*)accessdate(\s*=\s*[0-3]?[0-9] +(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*)([^][<>}{]*accessyear[\s=]+20\d\d)', r'\1accessdaymonth\2\3', text)
text = re.sub(r'(\|\s*)accessdate(\s*=\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w* +[0-3]?[0-9])([^][<>}{]*accessyear[\s=]+20\d\d)', r'\1accessmonthday\2\3', text)
text = re.sub(r'(\|\s*)accessdaymonth(\s*=\s*)\s*([^{|}<>]+?)\s*(\|[^][<>}{]*accessyear[\s=]+)(20\d\d)', r'\1accessdate\2\3 \5', text)
text = re.sub(r'(\|\s*)accessmonthday(\s*=\s*)\s*([^{|}<>]+?)\s*(\|[^][<>}{]*accessyear[\s=]+)(20\d\d)', r'\1accessdate\2\3, \5', text)
# Fix improper dates
# NOTE(review): the first pattern below defines only two groups but its
# replacement refers to \g<3> - confirm against the upstream source.
text = re.sub(r'(?i)(\{\{cit[ea][^{}]+\|\s*date\s*=\s*\d{2}[/\-.]\d{2}[/\-.])([5-9]\d)(?=\s*[|}])', r'\g<1>19\g<3>', text)
text = re.sub(r'(?i)(\{\{cit[ea][^{}]+\|\s*date\s*=\s*)(0[1-9]|1[012])[/\-.](1[3-9]|2\d|3[01])[/\-.](19\d\d|20\d\d)(?=\s*[|}])', r'\1\4-\2-\3', text)
text = re.sub(r'(?i)(\{\{cit[ea][^{}]+\|\s*date\s*=\s*)(1[3-9]|2\d|3[01])[/\-.](0[1-9]|1[012])[/\-.](19\d\d|20\d\d)(?=\s*[|}])', r'\1\4-\3-\2', text)
# Fix URLS lacking http://
text = re.sub(r'(\|\s*url\s*=\s*)([0-9a-z.\-]+\.[a-z]{2,4}/[^][{|}:\s"]\s*[|}])', r'\1http://\2', text)
# Fix {{citation|title=[url title]}}
text = re.sub(r'(?i)(\{\{cit[ea][^{}]*?)(\s*\|\s*)(?:url|title)(\s*=\s*)\[([^][<>\s"]*) +([^]\n]+)\](?=[|}])', r'\1\2url\3\4\2title\3\5', text)
# Associated Press is usually the agency, not the work or publisher
# NOTE(review): `(?P%s)` below has no group name; the `<name>` part appears to
# have been stripped from this copy of the source.
text = re.sub(r'(?i)\{\{\s*[Cc]ite\s*(?:web|news|newpaper|article)([^{}]+?)(\s*\|\s*)(?:publisher|work|author)(\s*=\s*)(\[\[[^[\]|]+\||\[\[|)(?P%s)(\]\])?(?=\s*\|[^][{}]+=|\s*\}\})' % '|'.join(agencies), r'{{cite news\1\2agency\3Associated Press', text)
text = re.sub(r'(?i)(\{\{[^{}]+\|\s*url\s*=[^][{|}]+\.ap\.org/[^{}]+\|\s*)agency(\s*=\s*)Associated Press', r'\1work\2Associated Press', text)
text = re.sub(r'(?i)(\{\{[^{}]+\|\s*)agency(\s*=\s*)Associated Press([^{}]+\|\s*url\s*=[^][{|}]+\.ap\.org/)', r'\1work\2Associated Press\3', text)
# Fix pages=1 and page=20-44 and page=p. 22 , corner p. 23 section 5
# text = re.sub(r'(\{\{\s*(?:[Cc]ite (journal|news))[^{}]*\| *pages?\s*=\s*)(p[pg]?[. ]|pages?\b) *(?=[\d\-]+\s*[|}])', r'\1', text)
text = re.sub(r'(?iu)(\{\{\s*(?:cite (?:journal|news|book|web)|citation)[^{}]*?\|\s*)pages(?=\s*=\s*(p|pp|pg|page|pages|)\b[.:]?\s*\d+\s*(\||\}\}))', r'\1page', text)
text = re.sub(r'(?iu)(\{\{\s*(?:cite (?:journal|news|book|web)|citation)[^{}]*?\|\s*)page(?=\s*=\s*(p|pp|pg|page|pages|)\b[.:]?\s*\d+\s*[\-]\s*\d+\s*(\||\}\}))', r'\1pages', text)
# \n in title causes links to break
# Each matched title value has its CR/LF characters replaced by spaces.
for m in re.finditer(r'\|\s*(?:title)\s*=\s*([^{|}]*?)\s*\|',text):
text = text.replace(m.group(), m.group().replace(m.group(1), m.group(1).replace('\n', ' ').replace('\r', ' ')))
# Change infoboxes from trailing pipes (likely stems from {{qif}} days)
p = re.compile(r'(\{\{[\w\s_]*[Ii]nfobox([^{}]*?\{\{[^{}]+\}\})*[^{}]*?[^{|}](= )?) *\| *\n ?(?=[\s\w]+=)', re.U)
# Repeated until no trailing pipe is left, one parameter per pass.
while p.search(text):
text = p.sub(r'\1\n| ', text)
text = text.replace('|\n}}', '\n}}')
# Fix web.archive.org links
# TODO |url= web.archive -> url+archiveurl
# Note: correct web.archive.org/2008/en.wikipedia.org/page format
# NOTE(review): the `(?P...)` groups and `\g` references below have lost their
# names in this copy; the pattern will not compile as written.
text = re.sub(ur'(\{\{(?:[Cc]ite web|[Cc]ite news|[Cc]ite|[Cc]itation)[^{}]*?)(\|\s*)url(\s*=\s*)(?Phttp://web.archive.org/web/(?P\d{4})(?P\d{2})(?P\d{2})\d{6}/(?Phttp://[^[\]<>"\s]+?))(\s*)(?=\||\}\})', ur'\1\2url\3\g\9\2archiveurl\3\g\9\2archivedate\3\g-\g-\g\9', text)
# Proper Capitilize ALL UPPERCASE names and titles
# str.capitalize() upper-cases the first character and lower-cases the rest.
for m in re.finditer(r'(\|\s*(?:title|last|first|author)\s*=\s)([A-Z"\'\s.:;\-+0-9]{10,})(?=[{|}])', text):
s = m.group(2)
s = s.capitalize()
text=text.replace(m.group(), m.group(1)+s)
# basic implemnt of tpl_cite
# tpl_cite is not defined in this section; each entry is unpacked as
# (find_template, replace_template, condition).
for (find_template, replace_template, condition) in tpl_cite:
text = re.sub(ur'(\{\{\s*)(?:%s)((?=\s*\|)[^{}]*(%s)[^{}]*\}\})' % (find_template, condition), r'\g<1>%s\g<2>' % replace_template, text)
return text
# Clean up HTML-style markup in `text` with a series of regex rewrites, pass
# tag/table-cell attributes to fixAttributes, warn about tags listed as removed
# in HTML 5, and return the rewritten text.
# `page` is accepted but is not referenced anywhere in this function.
# NOTE(review): several patterns and replacement strings below are visibly
# truncated (string literals broken across lines, unnamed `(?P` groups); the
# angle-bracket tag text seems to have been stripped from this copy. Restore
# them from the upstream source before relying on this function.
def fixHTML(page, text):
'''
'''
# Remove old {{prettytable}} header row formatting
text = re.sub(r'(?i)(\n\{\| class="wikitable[^\n]+\n\|-[^\n]*)(bgcolor\W+CCC+|background\W+ccc+)(?=\W+\n!)', r'\1', text)
# has no visible effect on output next to a block level item
text = re.sub(r'(\n([^<\n]|<(?!br[^>]*>))+\w+[^\w\s<>]*) (?=\n[*#:;]|\n?]*>)\n?', r'\1', text)
text = re.sub(r'(?i)<[/]?br([^{/}<>]*?)>', r'
', text)
# Arrg! people are using this is templated tables as a way to visually align items! See [[Battle of Stalingrad]]
# text = re.sub(r'(
|\n *\n *){4,}', r'\n{{clear}}\n', text)
text = re.sub(r'(?i)
', r'{{-}}', text)
text = re.sub(r'
', r'{{clear\1}}', text)
# Use class="center" instead of
text = re.sub(r'(?i)]*)>((?:[^<]|<(?!/?\s*center\s*>))*) ', r'\2
', text)
# combine font tags
text = re.sub(r'(?i)(]*)> *\n?]*>)((?:[^<]|<(?!/?font))*? *\n?) ', r'\1\2\3', text)
#
text = re.sub(r'(?i)]*)>\[\[([^[\]{|}]+)\|([^[\]\n]*?)\]\] ', r'[[\2|\3 ]]', text)
#TODO look for single character entiys such as ; \ in markup, but ignore /
# fixAttributes is used as the replacement callback; it reads the groups
# named 'tag' and 'attr' from each match.
text = re.sub(r'(<(?P\w+)(?= +)|\n\{\||(?<=\n)\|-|(?P\n[!|]|!!|\|\|))(?P[^<>[\]{|}\n]+(?(tag)(?=>)|(?(cell)(?=[!|][^!|])|(?=\n))))', fixAttributes, text)
# Convert simple to
# NOTE: [[link|text]] transforms to [[link|text ]] by tidy
text = re.sub(r'(?!\[\[)((?:[^<]|<(?!/?font))*?)(?', r'\3 ', text)
# Tally every opening tag name that appears in the deprecated list below.
removedTags = {}
for tag in re.findall(r'(?<=<)\w+(?=[^<>]*>)', text):
# Deprecated and removed elements
if tag in (
# Fonts style elements
"tt", "big", "small", "strike", "s", "u",
# Font modifier
"font", "basefont",
# Misc
"center", "dir"):
removedTags[tag] = removedTags.get(tag, 0) + 1
# Only a warning is printed; the tags themselves are left in the text here.
if removedTags:
wikipedia.output("\03{lightred}WARNING\03{default} : The following tags have been removed in the HTML 5 specifcation: %s" % ', '.join(('<%s> (%dx)' % t for t in removedTags.iteritems()) ))
return text
# re.sub callback used by fixHTML: `node` is a match object exposing the named
# groups 'tag' and 'attr'. The attribute text is normalised (quoting,
# presentational attributes rewritten as inline CSS, style attributes merged
# and passed through fixCSS) and the rebuilt markup is returned; when nothing
# meaningful changed, or the result is at least twice as long as the original
# attribute text, the original match text is returned unchanged.
# NOTE(review): indentation has been lost in this section. The extent of the
# `if tag == 'table':` branches and whether the "deprecated attribute" check
# near the end sits inside the preceding `for` loop cannot be determined from
# this copy and must be confirmed before re-indenting.
def fixAttributes(node):
tag = node.group('tag')
attr = node.group('attr')
# No 'tag' group means the match came from wiki-table syntax; group 1 tells
# whether it was a table start ({|) or a row (|-).
if tag:
tag = tag.lower()
elif '{|' in node.group(1):
tag = "table"
elif '|-' in node.group(1):
tag = "tr"
# htmltags is not defined in this section; unknown tags are returned untouched.
if tag not in htmltags + (None, ):
return node.group()
# HACKS
attr = re.sub(r'border="2" cellpadding="4" cellspacing="0" style="margin: *1em 1em 1em 0; background: *#f9f9f9; border: *1px #aaa+ solid; *border-collapse: *collapse(; *font-size: *[89]\d%)?', r'class="wikitable" style="', attr)
# un-subst: {{prettytable}} and it dirvatives
attr = re.sub(r'(?i)([^<>\n]*)border\W+2\W+cellpadding\W+4\W+cellspacing\W+0"?', r' class="wikitable" \1', attr)
# p = re.compile(r'(class="wikitable[^<>\n]+ style="[^<>"\n]*?)(margin\W+1em\W+|1em\W+1em\W+0\W+|background\W+f9f9f9\W+|border\W+1px\W+#aa+a\W+solid\W+|border-collapse\W+collapse\W+|font-size\W+(100%|95%|1em)\W+)+(?=[^<>"\n]*")', re.I)
# while p.search(text):
# text = p.sub(r'\1', text)
# WHERE DID I GET THIS!?!: ([^][{}<>|="\'\s]*[0-9a-zA-Z%._]+[^][{}<>|="\'\s]*)
# Helper: rebuilds one attribute as ` name="value"` with a lower-cased name.
def quoteAttrib(m):
# r' \g="\g"'
return ' %s="%s"'%(m.group('attribute').lower(), m.group('value').strip())
# Quote attributes
# NOTE(review): quoteAttrib reads groups named 'attribute' and 'value', but the
# two `(?P` groups in the pattern below have lost their names in this copy.
attr = re.sub(r"""(?uix)[ ]*
\b(?P\w{2,}) [ ]*=[ ]* ["']?(?P
(?<=")[^"]*?(?=")|
(?<=')[^']*?(?=')|
[^<=>"' [\]{|}]+(?=[<> ]|$)
)["']?""", quoteAttrib, attr)
# Deprecated classes
attr = attr.replace(' class="prettytable', ' class="wikitable')
# Repair broken HTML
attr = re.sub(r'(?i) bgcolor="([A-Fa-f0-9]{6})"', r' bgcolor="#\1"', attr) # add hash to colors
attr = re.sub(r'(?i) colspan="1"', r'', attr)
attr = re.sub(r'(?i) rowspan="1"', r'', attr)
# # move class= to the front
# attr = re.sub(r'^(\s*)( [^][{|}<>]+)?( class="[^"]+"(?=\s|\Z))', r'\1\3\2', attr)
if tag == 'table':
# TODO move me
# Tables
attr = re.sub(r'(?i) align="(left|right)"', r' style="float:\1;" ', attr)
attr = re.sub(r'(?i) align="center"', r' style="margin:auto;" ', attr)
attr = re.sub(r'(?i) align="(\w+)"', '', attr)
elif tag == 'div':
attr = re.sub(r'(?i) align="(left|right)"', r' style="float:\1;"', attr)
attr = re.sub(r'(?i) align="center"', r' class="center"', attr)
if tag == 'table':
attr = re.sub(r'(col|row)span=("1"|1)(?=\D)', r'', attr)
#attr = attr.replace('cellspacing="0"', 'style="border-collapse:collapse; "')
if 'border=' not in attr:
# See [[MediaWiki talk:Common.css# Wikitable borders without CSS]]
attr = re.sub(r'class="wikitable([^"\'{|}]*)"( *border="?1"?)*', r'class="wikitable\1" border="1"', attr)
if re.search('float: *right', attr) and 'toccolours' in attr and node.start() < 400:
# floats right, and near the top, gotta be a infobox
attr = re.sub(r'class="toc(colours|)', r'class="infobox', attr)
attr = re.sub(r'float: *right;|margin[^:;="]*:[^:;="]+|border="1"', r'', attr)
# border-collapse is not exactly the same but it's close enough
#attr = re.sub(r' cellspacing="0"', r' style="border-collapse:collapse;"', attr)
# For wikitable-classed tables, drop attributes/styles that the patterns
# below treat as redundant with that class.
if 'class="wikitable' in attr:
attr = re.sub(r'(?i)(border:)( 1px| #aaa+| solid)+',r'\1', attr)
attr = re.sub(r'(?i) border="?([0-9])"?', r'', attr)
attr = re.sub(r'(?i) cellspacing="?([0])"?', r'', attr)
attr = re.sub(r'(?i) cellpadding="?([2-4])"?', r'', attr)
attr = re.sub(r'(?i)margin: ?1em 1em 1em 0', r'', attr)
attr = re.sub(r'(?i)background: ?#f9f9f9', r'', attr)
attr = re.sub(r'(?i)border-collapse: ?collapse', r'', attr)
attr = re.sub(r'font-size: ?(100%|1em)', r'', attr)
#if # avoid float: position: etc..
#attr = re.sub(r'font-size: ?\.?9\d(%|em)', r'', attr)
# replace with CSS
attr = re.sub(r'(?i) align="(left|center|right|justify)"', r' style="text-align:\1;"', attr)
attr = re.sub(r'(?i) bgcolor="([^"]+?)"', r' style="background-color:\1;"', attr)
#attr = re.sub(r'(?i) border="?([1-9])"?', r' style="border:\1px;"', attr)
attr = re.sub(r'(?i) color="([^"]+?)"',r' style="color:\1;"', attr)
attr = re.sub(r'(?i) clear="(left|right)"', r' style="clear:\1;"', attr)
attr = re.sub(r'(?i) clear=" *all *"', r' style="clear:both;"', attr)
attr = re.sub(r'(?i) face="([^"]+?)"', r' style="font-family:\1;"', attr)
attr = re.sub(r'(?i) height="([^"]+?)"', r' style="height:\1;"', attr)
attr = re.sub(r'(?i) nowrap(="(nowrap|yes|true)"|(?= )|$)', r' style="white-space:nowrap;"', attr)
attr = re.sub(r'(?i) size="(\d+(em|%|px|pt))"', r' style="font-size:\1;"', attr)
attr = re.sub(r'(?i) valign="([^"]+?)"', r' style="vertical-align:\1;"', attr)
attr = re.sub(r'(?i) width="([^"]+?)"', r' style="width:\1;"', attr)
# font size="#" render browser dependent, W3C leaves it open
fontSizeConvert = {'1':'0.8em','2':'1em','3':'1.2em','4':'1.4em','5':'1.9em','6':'2.4em','7':'3.7em',
'-4':'50%','-3':'60%','-2':'70%','-1':'80%','0':'100%',
'+1':'120%','+2':'140%','+3':'160%','+4':'180%','+5':'200%','+6':'250%','+7':'300%',}
for n in re.finditer(r' size="([1-7]|[+-][0-6])"', attr):
attr = attr.replace(n.group(), r' style="font-size:%s;"'%fontSizeConvert[n.group(1)])
# merge style attributes together
# Each pass folds two style="..." attributes into one; loop until only one is left.
stylemerge = re.compile(r' style="([^"{|}\n]+?);* *"([^][!<>{|}\n]*?) style="([^"{|}\n]+)"')
while stylemerge.search(attr):
attr = stylemerge.sub(r'\2 style="\1; \3"', attr)
# Fix up style parameters
# A style attribute whose cleaned value is empty is removed entirely.
for styleMatch in re.finditer(r' style="([^[\]{|}\n]*?)"', attr):
styleText = fixCSS(styleMatch.group(1))
attr = attr.replace(styleMatch.group(), styleText and ' style="%s"'%styleText or '')
if '=' in styleText:
wikipedia.output("\03{lightyellow}WARNING\03{default} : U+003D EQUALS SIGN (=) character found in style attribute")
# Remove all non approved attributes
# htmlattrs is not defined in this section.
for m in re.finditer(r'(?<= )(\w+)(="[^"]+"| +(?=\w)| *$| *>)', attr):
if m.group(1).lower() not in htmlattrs and tag:# HACK remove when proper table support is in
wikipedia.output("\03{lightred}REMOVED\03{default} : Invalid attribute %s" % (m.group(),))
attr = attr.replace(m.group(), '')
elif m.group(2) == '=""':
wikipedia.output("Emptry attribute")
else:
attr = attr.replace(m.group(), m.group(1).lower() + m.group(2))
# Alert user about deprecated html attributes
# FIXME this should be split up into General, Table, Font
# TODO add border=
if m.group(1).lower() in "align|alink|background|bgcolor|border|cellspacing|cellpadding|clear|compact|color|face|height|hspace|link|noshade|nowrap|size|start|text|type|value|valign|vlink|width|vspace".split('|'):
wikipedia.output("\03{lightred}DEPRECATED\03{default} : %s attribute (in %s) "%(m.group(), tag or "Table"))
wikipedia.output(node.group().strip())
# put back in
# Comparison ignores spaces, semicolons, double quotes and letter case.
if re.sub(r'[ ;"]', '', node.group('attr').lower()) != re.sub(r'[ ;"]', '', attr.lower()) and len(attr) < len(node.group('attr')) * 2:
return ''.join((node.group(1).lower(), attr.rstrip() ))
else:
return node.group()
def fixCSS(styleText):
    """
    Normalise the contents of an inline ``style="..."`` attribute.

    styleText -- the raw declaration list, e.g. ``'margin:5; color: #FFFFFF'``

    Returns the cleaned declaration list with surrounding whitespace
    stripped.  Every declaration is terminated by ``;`` (a terminator is
    appended to the input before processing), so a non-empty result ends
    with a semicolon.

    Colour values that are neither collapsible to three digits nor already
    lower case are looked up in the module-level ``namedColors`` mapping.
    """
    #TODO
    # add filter for value and dictionary units

    # Stylistics changes
    styleText += ';'  # guarantees that the last declaration is terminated
    styleText = re.sub(r' *: *', ':', styleText)
    styleText = re.sub(r' *(; *)+', '; ', styleText)
    # Remove "float; ..." and "float:;"
    styleText = re.sub(r'(\A *|;)([^;:=]*:? *;)+', r'\1', styleText)
    styleText = re.sub(r'[\w\-\s]:; ', '', styleText)
    styleText = re.sub(r'(background|color):([a-fA-F0-9]{3,6})', r'\1:#\2', styleText)

    if styleText.count('background') == 1:
        styleText = styleText.replace('background-color:', 'background:')

    # Background:none is shorter than background-color:transparent, but resets image related properties
    # We also assume that people will not set anything else since background-image: is filtered out
    # See: [[User:Chris Chittleborough/CSS-notes]]
    styleText = re.sub(r'background:[^:;]*transparent[^:;]*;', r'background:none;', styleText)

    # Assumed units
    styleText = re.sub(r'(width|height):(\d{2,});', r'\1:\2px;', styleText)
    styleText = re.sub(r'((?:background|border|color)(?:-color)?):([a-fA-F0-9]{3,6})(?=[ ;])', r'\1:#\2', styleText)

    # Fix units
    # The look-ahead keeps the following ';' or ' ' in place, so only the unit
    # is inserted here; adding a second ';' would yield "margin:5px;;" or cut
    # a shorthand in half ("border:1px; solid").
    styleText = re.sub(r'\b(width|height|border|margin|padding):(\d{2,}|[1-9])(?=[; ])', r'\1:\2px', styleText)
    styleText = re.sub(r'(?<=[ :]0)(em|%|px|pt)(?=[ ;])', "", styleText)

    # IE color compatiblity
    styleText = re.sub(r'(?i)\bgrey\b', r'gray', styleText)
    styleText = re.sub(r'(?i)(dark|dim|light|lightslate|slate)gr[ae]y', r'\1grey', styleText)

    # Shorten CSS color values
    def shortenColor(m):
        color = m.group()
        # #aabbcc -> #abc whenever each channel is a doubled digit
        doubled = re.match(r'#([0-9a-f])\1([0-9a-f])\2([0-9a-f])\3\Z', color.lower())
        if doubled:
            return '#' + ''.join(doubled.groups())
        if color.upper() in namedColors:
            return namedColors[color.upper()]
        return color.lower()
    styleText = re.sub(r'#(?:[0-9a-fA-F]{6}|[0-9a-fA-F]{3})(?=[ ;!])', shortenColor, styleText)

    # use mirroring
    # "1em 1em 1em 1em" -> "1em", "1em 2em 1em 2em" -> "1em 2em",
    # "1em 2em 3em 2em" -> "1em 2em 3em"
    styleText = re.sub(r'(margin|padding):(?P<v>-?[\.0-9]+[a-zA-Z]+|0)( (?P=v))+;', r'\1:\2;', styleText)
    styleText = re.sub(r'(margin|padding):(-?[\.0-9]+[a-zA-Z]+|0) (-?[\.0-9]+[a-zA-Z]+|0) \2 \3;', r'\1:\2 \3;', styleText)
    styleText = re.sub(r'(margin|padding):(-?[\.0-9]+[a-zA-Z]+|0) (-?[\.0-9]+[a-zA-Z]+|0) (-?[\.0-9]+[a-zA-Z]+|0) \3;', r'\1:\2 \3 \4;', styleText)

    return styleText.strip()
# Rewrite bracketed external links that point at Wikimedia-related sites
# (upload.wikimedia.org, toolserver, sister projects, meta/commons, wikia and
# every entry of interwiki_map) into [[prefix:target|label]] style links, and
# return the rewritten text.
# `page` supplies the site language code and the namespace number.
# NOTE(review): indentation has been lost in this section; whether the wikia
# substitution belongs to the `else:` branch or runs unconditionally cannot be
# determined from this copy and must be confirmed before re-indenting.
def ext2intLinks(page, text):
# Direct file URLs, with and without a link label.
text = re.sub(r'\[http://upload.wikimedia.org/wikipedia/(?:commons|%s)/[0-9A-Fa-f]/[0-9A-Fa-f]{2}/([^[\]<>\s?]+) *((?<= )[^\n\]]+)\]' % (page.site().language()), r'[[Media:\1|\2]]', text)
text = re.sub(r'\[http://upload.wikimedia.org/wikipedia/(?:commons|%s)/[0-9A-Fa-f]/[0-9A-Fa-f]{2}/([^[\]<>\s?]+)\]' % (page.site().language()), r'[[[Media:\1]]]', text)
text = re.sub(r'\[http://(www\.toolserver\.org|toolserver\.org|tools\.wikimedia\.org|tools\.wikimedia\.de)/([^][<>"\s;?]*)\?? ([^]\n]+)\]', r'[[tools:\2|\3]]', text)
# Namespace 0 uses patterns that additionally skip links into the listed
# non-content namespaces (negative look-ahead); other namespaces do not.
if page.namespace() == 0:
# [[WP:SELF]] states that we shouldn't cross link from the main namespace
text = re.sub(r'''(?ix)\[http://([a-z]{3}(?:-[a-z]+)*)\.(?:
(wikt)ionary|
wiki(n)ews|
wiki(b)ooks|
wiki(q)uote|
wiki(s)ource|
wiki(v)ersity)\.(?:com|net|org)/wiki/
(?![_ :]*(?:Talk|Help|User|Wikipedia|Wikinews|Wikibooks|wikiquote|wikisource|wikiversity|Portal|MediaWiki)(?:[ _]talk)?:)
([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[\2\3\4\5\6\7:\1:\8|\9]]', text)
text = re.sub(r'''(?ix)\[http://(meta|commons|incubator|quality)
\.wikimedia\.(?:com|net|org)/wiki/
(?![_:]*(?:Talk|Help|User|Meta|commons|incubator|quality|Portal|MediaWiki)(?:_talk)*:)
([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[\1:\2|\3]]', text)
else:
text = re.sub(r'''(?ix)\[http://([a-z]{3}(?:-[a-z]+)*)\.(?:
(wikt)ionary|
wiki(n)ews|
wiki(b)ooks|
wiki(q)uote|
wiki(s)ource|
wiki(v)ersity)\.(?:com|net|org)/wiki/
([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[\2\3\4\5\6\7:\1:\8|\9]]', text)
text = re.sub(r'''(?ix)\[http://(meta|commons|incubator|quality)
\.wikimedia\.(?:com|net|org)/wiki/
([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[\1:\2|\3]]', text)
text = re.sub(r'''(?ix)\[http://([a-z0-9\-]+)\.wikia\.(?:com|net|org)/wiki/
([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[wikia:\1:\2|\3]]', text)
# Reverse interwiki map
# [0-9A-Za-z\-.:_] not escaped
# [;:@$!*(),/] are converted back in GlobalFunctions.php
# [_#\'\\^`~] are assumed to be safe
#conflict = {}
# interwiki_map is not defined in this section; each value is a URL template
# whose escaped "$1" placeholder is swapped for a capture group.
for (prefix, map) in interwiki_map.iteritems():
# Expensive overlap test
#if map in conflict:
# print("Collision in interwiki map [[%s:]] and [[%s:]] on %s " % (prefix, conflict[map], map))
#else:
# conflict[map] = prefix
#for a,b in interwiki_map.iteritems():
# if b.find(map) == 0 and a != prefix:
# print("Overlap between interwiki map [[%s:]] (%s) and [[%s:]] (%s) " % (prefix, map, a, b))
text = re.sub(r'\[%s +([^\n\[\]]+)\]'%re.escape(map).replace('\\$1', r'([0-9A-Za-z\-.;;:@$!*(),/_#\'\\^`~]*)'), r"[[%s:\1|\2]]"%prefix, text)
return text
def canonicalTitle(title, ucfirst=True): # TODO rename keyword
    """
    Convert a unicode or byte string into a MediaWiki-style title.

    Supports percent-encoded UTF-8 and HTML character references.

    title   -- unicode object, or a byte string in ASCII, UTF-8 or Latin-1
    ucfirst -- when true (default) the first character is upper-cased

    Returns the title as unicode: character references and percent escapes
    decoded, underscores turned into spaces, runs of spaces merged, and
    everything from the first '#' onwards (the section part) removed.
    """
    # Decode byte strings: default codec first, then UTF-8, and Latin-1 as the
    # final fallback since it accepts every byte value.
    try:
        title = unicode(title)
    except UnicodeError:
        try:
            title = unicode(title, 'utf-8')
        except UnicodeError:
            title = unicode(title, 'latin-1')

    # HTML character references
    title = wikipedia.html2unicode(title)
    # Unpercent-encode (unquote works on bytes, hence the round trip)
    title = unicode(urllib.unquote(title.encode('utf-8')), 'utf-8')
    # Underscore to space and Strip space
    title = title.replace('_', ' ').strip()
    # Merge multiple spaces in a single pass; a replace loop keyed on a single
    # space would never terminate for any title containing a space.
    title = re.sub(u' {2,}', u' ', title)
    # First uppercase
    if ucfirst and title:
        title = title[0].upper() + title[1:]
    # Strip the section part
    if '#' in title:
        title = title[:title.index('#')]
    return title
# Prettify piped wiki links ([[target|label]]): underscores in the target
# become spaces, MediaWiki ".XX" anchor escapes and "%XX" escapes are decoded,
# and [[A|AB]] is shortened to [[A]]B. Returns the rewritten text.
# `page` is accepted but is not referenced anywhere in this function.
# NOTE(review): indentation has been lost in this section; which statements
# belong to the `if '#' in link:` branch and to the two nested `if`s near the
# end of the loop must be confirmed before re-indenting.
def simplifyLinks(page, text):
# Turns a ".XX" anchor escape into "%XX" so that unquote() can decode it.
def dot2percent(m): return m.group().replace('.', '%')
# Prettify links, remove underscore and decode characters
for m in re.finditer(ur'\[\[([^[\]{|}\n]+)\|([^\n|]*?)\]\]', text):
# Work on UTF-8 bytes from here on; converted back to unicode further down.
link = m.group(1).replace('_', ' ').encode('utf-8')
if '#' in link:
title, anchor = link.split('#', 1)
# Literal '%' is rewritten to its dot escape first, so the only '%' present
# afterwards are those produced by dot2percent.
anchor = anchor.replace('%', '.25')
anchor = re.sub(r'''(?x)
# Single byte character (Printable ASCII)
# we make that [0-9A-Za-z\-.:_] and [[\]{|}] are not included
\.2[1-9A-CF]
|\.3[BD-F]
# We need to avoid encoding and
|\.3C(?!\w|/\w|\.2F\w)
|\.40
|\.5[CE]
|\.60
|\.7E
# skip .8-B\h
# Two byte UTF-8 character U+0080-U+07FF
|\.[CD][0-9A-F]\.[89AB][0-9A-F]
# Three byte UTF-8 character U+0800-U+FFFF
|\.E[0-9A-F]\.[89AB][0-9A-F]\.[89AB][0-9A-F]
# Four byte UTF-8 character U+10000-U+10FFFF
|\.F[0-7]\.[89AB][0-9A-F]\.[89AB][0-9A-F]\.[89AB][0-9A-F]
''', dot2percent, anchor)
link = ''.join((title, '#', anchor))
link = urllib.unquote(link) # unescape %xx
# Specific formating
if link.startswith('tools:'):link = link.replace(' ', '_')
link = link.replace('# ', '#') # get ride of copy/paste space
link = unicode(link, 'utf-8')
#if m.group(2)[0:1].islower():
#if m.group(1) != link
# Lower-case the first letter of the target only when neither the rest of the
# target nor the label contains an upper-case character and the regex
# heuristic below matches.
if not any((s.isupper() for s in link[1:])) and not any((s.isupper() for s in m.group(2))):
if re.search(r'(?i)\[\[(\w{3,})\w{0,3}[()_ |[\]].*?\b\1', m.group()):
# Come up with better huristics
link = link[0].lower() + link[1:]
text = text.replace(m.group(), '[[%s|%s]]'%(link, m.group(2)))
# Simplify links
# FIXME use canonicalTitle
# [[A|AB]] -> [[A]]B
text = re.sub(ur'(?u)\[\[([^{|}[\]]+)\|\1(\w*)\]\]', ur'[[\1]]\2', text)
## A[[ABC|B]]C -> [[ABC]]
#text = re.sub(ur'(?ui)([^{|}[\]]* *) *\[\[ *\1([^{|}[\]]+ *)( *[^{|}[\]]*) *\| *\2\]\]\3', ur'[[\1\2\3]]', text)
# TODO
# unbypass redirect change [[Light_cycle#Light_cycles]] and [[Tron_(film)#Light_cycles]] to the redirect [[Light cycle]]
# find redirect such that A [[Article |B]] C to [[A B C]]
return text
# Clean up footnote/reference markup in `text`: normalise the tag case, repair
# malformed references, optionally move punctuation relative to references,
# give duplicated references a generated name, and adjust the reference list
# template. Returns the rewritten text.
# `page` is accepted but is not referenced anywhere in this function.
# NOTE(review): most patterns in this function are visibly damaged in this copy
# (unbalanced brackets, unterminated string literals, missing angle-bracket
# tag text) and indentation has been lost. The code cannot run as shown;
# restore it from the upstream source before making behavioural changes.
def fixReferences(page, text):
# Standardize to lowercase reference name, makes things easier for everyone
text = re.sub(r'(?uis)<(/?)REF\b([^>]*)>', r'<\1ref\2>', text)
# it should be name = " or name=" NOT name ="
text = re.sub(r'[)', r'http://\1', text)
text = re.sub(r'(?<=][)\s*\[?(?:http://)?([a-z0-9\-\.]*?[a-z0-9\-]+\.[a-z\.]{2,6}/[^][<>\s"|]+) +([^][{|}<>\n/]+?)\]?\s*(?=])', r'[http://\1 \2]', text)
# TODO: Fix the below [ref] to [[url]] conversion
text = re.sub(r'(?is)[\s*(\[\w+://[^][<>"\s]+\s*\])\s*(\[\w+://[^][<>"\s]+\s*\])\s*]', r'[\1][\2]', text)
## Badly formed references
# Fake reference ([url] )
text = re.sub(r'(?i)\s*\[(\w+://[^][<>"\s]+) *\]\s* ', r'[\1]', text)
# Bracket to reference conversion
# BUG matches
# The same substitution is applied eight times in a row.
for i in range(8):
#text = re.sub(r'(?miu)(^[^*#;:= ]{1,4}.{4,}?)(?\s"]{8,})\s*\](?![^-]*-->)(?!([^<]|<(?!ref))*)', r'\1[\2]', text)
# testing
text = re.sub(r'(?miu)(^[^#;:= ]{1,4}.{4,}?)(?<=[^*#]{15})(?\s"]{8,})\s*\](?![^-]*-->)(?!([^<]|<(?!ref))*)', r'\1[\2]', text)
# remove invalid references
text = re.sub(r'(?i)[ *]', '', text)
## Format Punctuation
# Applied if "[,.;:]" is dominate
if len(re.findall(r'[.,;:] *\s?[ len(re.findall(r'(?:]|]+/>) *\s?[.,;:]', text)):
# Move punctuation left and space right but before \n
text = re.sub(r'(?s)(?<=[\w")\]])( *)((?: *\s??[]+?/>| *\s??][]*?>(?:[^<]|<(?!/?ref>))*?])+)( *)\n?([.,]|(?]+?/>| *\s??[]*?>(?:[^<]|<(?!/?ref>))*?])+)(?= *\s?[^\s<>])', r'\2\1', text)
# Remove duplicate punctuation
text = re.sub(r'(?s)(?P[.,;:])(["]?(?:[]+?/> *\s?|][]*?>([^<]|<(?!/?ref>))*?] *\s?)+)(?P=punc)(?![.,]|(?|[]+?/>) +(]
text = re.sub(r'(|[]+?/>)()((\'{2,5}|)[\w"(\[])', r'\1 \3', text)
text = re.sub(r'(]|[]+?/>)( {3,})([\w(\[])', r'\1 \3', text)
elif len(re.findall(r'(?:]|]+/>) *\s?[.,;:]', text)) > 10:
wikipedia.output('\03{lightyellow}ALERT\03{default}: Punctuation after the references is the dominate format!')
# Merge duplicate refs
# TODO seperate reference group from naming
for m in re.finditer(r'(?si)([)(.*?)(])', text):
# Skip single references
if text.count(m.group()) <= 1:
continue
# Get a meaningful word part
for p in (r'\|\s*last\s*=(\w+)', # Reference template: | last = LASTNAME
r'[Bb][Yy] +[A-Z][a-z]+ +([A-Z][a-z]+)[.,\'"]',
r'^((?:Mc|)[A-Z][a-z])[,.]', # First word, must be capitalized and followed by punctuation
r'(?s)\w+://[a-z0-9\-\.]*?([a-z0-9\-]+)\.[a-z\.]{2,6}[ /|=!]', # Website DOMAIN
r'(?s)^(?:\[\[[^][]+\|)?((? 4 and match.group(1).lower() not in ignoreAsNames:
refname = match.group(1)
break
else:
refname = 'autogenerated' # Default name
# try for the longest Capitalized word
for n in re.findall(r'\b(?:Mc)?[A-Z][a-z]+\b', re.sub(r'\|[^{|}=]+=|\{\{[^{|}]+\||\[\[^][|]+\|', ' ', m.group(2) )):
if len(n) > len(refname):
refname = n
# Remove non-letters to avoid names like "rescue007"
refname = refname.strip('\t\r\n 0123456789-').lower()
# Get a number
# Access dates and "Retrieved ..." phrases are blanked out before searching,
# so their digits are not picked up as the page/year suffix.
for p in (r'\|\s*(?:pages|page|p|pp)\s*=\s*(\d+)',
r'\b(?:pages|page|p|pp|pg)[.:= ]*(\d{1,4})\b[\w\s\.\-<&\]]*',
r'\|\s*year\s*=\s*(\d{4})',
r'\b(19\d\d|200[0-7])\b',
r'\b([mclxvi]*[clxvi]{2,6})(?:\b|\.)' ):
match = re.search(p, re.sub(r'accessdate\s*=[^{|}]*|Retrieved [\s\w\[\],]+', ' ', m.group(2)) )
if match and refname+match.group(1) not in text:
refname = refname+match.group(1)
break
else:
# No usable number found: append the first counter value that makes the
# name unused in the text.
i = 1
while refname+str(i) in text: i+=1
else: refname += str(i)
# the replacement name should be 50% smaller
if len(m.group(2)) * 0.50 > len(refname) + 8:
text = text.replace(m.group(), '[%s]' % (refname, m.group(2)), 1)
text = text.replace(m.group(), '' % refname)
# remove formatting wrappers (adapted from AWB)
m = re.search(r'(?i)(<(span|div)( class="(references-small|small|references-2column)"|)>\s*){1,2}\s* (\s*(span|div)>){1,2}', text)
if m and m.group().count('') > 30:
text = re.sub(r'(?is)(=\s+()*\s*)(\{\{Cleanup-link rot[^{}]*\}\}\s*)?( |\{\{(?:Listaref|Reference|Refs|Reflist|Refs)\|?[134]?\}\})', r'\1{{reflist|colwidth=30em}}', text)
elif text.count('') < 8:
text = re.sub(r'(?is)(=\s+)\{\{reflist\|(\d+|colwidth=\d+\w+)\}\}', r'\1{{reflist}}', text)
else:
pass
return text
def correctdate(s):
    """Placeholder: not implemented yet; ignores `s` and returns None."""
    pass
def wiki_table(match):
    """Placeholder regex callback: return the matched text unchanged.

    match -- a regular expression match object
    """
    return match.group()
def html_attrib(match):
    """Placeholder regex callback: return the matched text unchanged.

    match -- a regular expression match object
    """
    return match.group()
##
# Module-level state shared by hideText() and showText():
# hideTokens maps a token number to the original text that hideText() swapped
# out; hideRegex matches the spans that are to be hidden. The pattern is
# compiled case-insensitively and with '.' matching newlines.
# NOTE(review): the alternatives below look damaged in this copy - several are
# empty or consist only of '.*?', which suggests the angle-bracket tag text
# was stripped. As written, the empty alternative matches at every position.
# Restore the list from the upstream source.
hideTokens = {}
hideRegex = re.compile('|'.join([
r'',
r' .*? ',
r' .*? ',
r' .*? ',
r' ',
r'',
r'.*? ',
r'.*?',
]), re.I | re.S)
def hideText(text):
    """
    Replace every span matched by hideRegex with a numbered placeholder.

    text -- unicode wikitext

    Returns the text with each hidden span swapped for a token of the form
    U+230A x4, a six digit number, U+230B x4.  The original spans are stored
    in the module-level hideTokens dictionary, keyed by that number, so that
    showText() can put them back.
    """
    global hideTokens
    # Continue after the highest number already handed out, so a second call
    # made before showText() does not overwrite stored spans.  The first token
    # of a fresh run is still 112.
    n = max(hideTokens) if hideTokens else 111
    for m in hideRegex.finditer(text):
        hidden = m.group()
        if not hidden:
            # str.replace('') would insert the token between every character
            continue
        n += 1
        hideTokens[n] = hidden
        text = text.replace(hidden, u'⌊⌊⌊⌊%06d⌋⌋⌋⌋' % n)
    return text
def showText(text):
    """
    Put back the spans that hideText() replaced with placeholders.

    text -- unicode wikitext containing hideText() tokens

    Returns the text with every known token replaced by its original span.
    The module-level hideTokens dictionary is emptied afterwards, whether or
    not restoring succeeded, so stale tokens cannot leak into the next page.

    Raises RuntimeError when a placeholder is still present after all stored
    tokens have been substituted.
    """
    global hideTokens
    try:
        for (key, value) in hideTokens.items():
            text = text.replace(u'⌊⌊⌊⌊%06d⌋⌋⌋⌋' % key, value)
        if re.search(u'(?u)⌊⌊⌊⌊\\d{6,}⌋⌋⌋⌋', text):
            wikipedia.output("WARNING: Unable to replace all hidden tokens")
            # String exceptions are not valid; raise a real exception type.
            raise RuntimeError("Please report this problem at [[User talk:Dispenser]]")
    finally:
        hideTokens = {} # Empty
    return text
# Command-line entry point. With -test (optionally -test:NAME) a test file
# under ./text/ is read, run through fix(), diffed and printed; otherwise every
# remaining argument is handed to the page generator factory and each
# generated page is fixed and saved when the text changed.
# NOTE(review): indentation has been lost in this section and the multi-line
# print format string no longer contains any placeholders although it is
# formatted with two values - restore it from the upstream source.
def main():
gen = None
namespaces = []
genFactory = pagegenerators.GeneratorFactory()
summary = "Applying general fixes for links, HTML, and/or references"
for arg in wikipedia.handleArgs():
#elif arg.startswith('-namespace:'):
# try:
# namespaces.append(int(arg[11:]))
# except ValueError:
# namespaces.append(arg[11:])
if arg == '-test' or arg.startswith('-test:'):
# arg[6:] is the part after "-test:"; '/' is replaced so the name stays
# inside ./text/, and an empty name falls back to Tests.html.
# NOTE(review): the file object is never closed.
f = open('./text/%s'%(arg[6:].replace('/', '|') or 'Tests.html'))
test = unicode(f.read(), 'utf-8')
site = wikipedia.getSite()
page = wikipedia.Page(site, 'ParserTests')
# Disable cgitb disk loggging
import cgitb; cgitb.enable()
wikipedia.output("Default site: %s"%site.sitename())
# hackist seek
wikipedia.showDiff(test, fix(text=test, page=page))
# `parser` here must provide a parser() callable - presumably a project-local
# module rather than the standard library one; verify.
import parser
print '''
''' % (parser.parser(test).encode('utf-8'), parser.parser(fix(text=test, page=page)).encode('utf-8'))
return
else:
genFactory.handleArg(arg)
# `gen` is never assigned in the argument loop, so the combined generator
# from the factory is always what gets used.
if not gen:
gen = genFactory.getCombinedGenerator()
if not gen:
wikipedia.showHelp('commonfixes')
return
for page in gen:
# Missing pages and redirects are reported and skipped.
try:
page.get()
except wikipedia.NoPage:
wikipedia.output('%s does not exist!' % page.aslink())
continue
except wikipedia.IsRedirectPage:
wikipedia.output(u'Page %s is a redirect' % page.aslink())
continue
text = fix(page=page)
# Save only when fix() actually changed something.
if text != page.get():
wikipedia.showDiff(page.get(), text)
wikipedia.setAction(summary)
page.put(text)
else:
print 'No changes necessary'
# Script entry: runs only when executed directly and when
# wikipedia.handleUrlAndHeader() returns a true value. main() is wrapped in
# try/finally so that wikipedia.endContent() is called even if main() raises.
# NOTE(review): indentation has been lost here; whether wikipedia.stopme()
# belongs to the `finally:` clause must be confirmed before re-indenting.
if __name__ == "__main__" and wikipedia.handleUrlAndHeader():
try:
wikipedia.startContent(form=True)
main()
finally:
wikipedia.endContent()
wikipedia.stopme()
|