#!/usr/bin/python # -*- coding: utf-8 -*- """ ¶ms; -test Test the routines used for regession testing -namespace:n Number or name of namespace to process. The parameter can be more than one to add additional namespaces commonfixes applied fixes which are general and specific to the English Wikipedia """ # TODO # TIP: use "%(dictname)s" % groupdict() a # better ref combining , combine urls and on ignoring a list of character (matching) # Seperate English from generic wikisyntax # Seperate enwiki sepefic # steel stuff from # http://en.wikipedia.org/wiki/User:Polbot/source/Reffix.pl # FIXME: # http://en.wikipedia.org/w/index.php?title=London&diff=prev&oldid=253531178 (infobox) # http://en.wikipedia.org/w/index.php?title=Hoover_Dam&diff=prev&oldid=253529821 # FIXME: # http://en.wikipedia.org/w/index.php?title=Rolls-Royce_RR300&diff=190562064&oldid=175311735 # http://www.nationaltrust.org/magazine/archives/arc_news_2007/010807.htm # http://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1186&context=theses import re, urllib import wikipedia, pagegenerators try: import noreferences except ImportError: noreferences = None if True: import MySQLdb else: MySQLdb = False docuReplacements = { '¶ms;': pagegenerators.parameterHelp, } ignoreAsNames = ( 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday' ) # NOT IMPLEMENTED PROPERLY # Will change work/publisher cite news and |agency="dictvalue" agencies = { "AP": "Associated Press", "The Associated Press": "Associated Press", "Associated Press": "Associated Press", "AP News": "Associated Press", # "DPA": "Deutsche Presse-Agentur", # "AFP": "Agence France-Presse", } # "The" will be stripped if it exist # So don't include Edge case e.g. 
"People" and "The People" commonPublishers = ( "American Journalism Review", "Associated Press", "BBC News", "BBC", "Boston Globe", "Chicago Tribune", "CNN", "Daily Telegraph", "Economist", "Guardian", "Huffington Post", "International Herald Tribune", "MTV", "New York Times", "NY Times", "Observer", "The Times", "The Register", "San Francisco Chronicle", "Scientific American", "Seattle Times", "Reuters", "Rolling Stone", "Wall Street Journal", "Washington Post", # Web only sources "IGN", "GameStop", "Electronic Gaming Monthly", "Kotaku", "Ars Technica", "Joystiq", "Tom's Hardware", "Salon", "United Press International", # since 1907 ) # template choser # not implemented yet tpl_cite = ( # Match templates, replace template, regex condition ('cite web', 'cite encyclopedia', r'\|\s*url\s*=\s*http://(www\.)?(encarta.com|encarta.msn.com|betanitca.com)'), ('cite web', 'cite news', r'\|\s*url\s*=\s*http://(www\.)?(nytimes.com|ap.google.com|news\.bbc\.co\.uk|time\.com|economist\.com|timesonline\.co\.uk|channelonline\.tv|cnn\.com|independent\.co\.uk|cbc.ca|theglobeandmail.com)/'), ('cite web', 'cite paper', r'\|\s*url\s*=\s*http://(www\.)?(havard.edu)'), ('cite web', 'cite news', r'\|\s*agency\s*='), ('cite web', 'cite book', r'\|\s*isbn\s*=\s*[^\s{|}[\]]'), ) htmltags = ( # pairs "b", "i", "u", "font", "big", "small", "sub", "sup", "h1", "h2", "h3", "h4", "h5", "h6", "cite", "code", "em", "s", "span", "strike", "strong", "tt", "var", "div", "center", "blockquote", "ol", "ul", "dl", "table", "caption", "pre", "ruby", "rt" , "rb" , "rp", # single "br", "p", "hr", "li", "dt", "dd", # nest "table", "tr", "td", "th", "div", "blockquote", "ol", "ul", "dl", "font", "big", "small", "sub", "sup", # table tags "td", "th", "tr", ) htmlattrs = ( "title", "align", "lang", "dir", "width", "height", "bgcolor", "clear", "noshade", "cite", "size", "face", "color", "type", "start", "value", "compact", #/* For various lists, mostly deprecated but safe */ "summary", "width", "border", 
"frame", "rules", "cellspacing", "cellpadding", "valign", "char", "charoff", "colgroup", "col", "span", "abbr", "axis", "headers", "scope", "rowspan", "colspan", "id", "class", "name", "style" ) # CSS HEX color values to named (<9 chars) color table namedColors = {'#00FFFF': 'aqua', '#F0FFFF': 'azure', '#F5F5DC': 'beige', '#FFE4C4': 'bisque', '#000000': 'black', '#0000FF': 'blue', '#A52A2A': 'brown', '#FF7F50': 'coral', '#FFF8DC': 'cornsilk', '#DC143C': 'crimson', '#00FFFF': 'cyan', '#00008B': 'darkBlue', '#008B8B': 'darkCyan', '#A9A9A9': 'darkGray', '#A9A9A9': 'darkGrey', '#8B0000': 'darkRed', '#FF1493': 'deepPink', '#696969': 'dimGray', '#696969': 'dimGrey', '#FF00FF': 'fuchsia', '#FFD700': 'gold', '#808080': 'gray', '#808080': 'grey', '#008000': 'green', '#F0FFF0': 'honeyDew', '#FF69B4': 'hotPink', '#4B0082': 'indigo', '#FFFFF0': 'ivory', '#F0E68C': 'khaki', '#E6E6FA': 'lavender', '#00FF00': 'lime', '#FAF0E6': 'linen', '#FF00FF': 'magenta', '#800000': 'maroon', '#FFE4B5': 'moccasin', '#000080': 'navy', '#FDF5E6': 'oldLace', '#808000': 'olive', '#FFA500': 'orange', '#DA70D6': 'orchid', '#CD853F': 'peru', '#FFC0CB': 'pink', '#DDA0DD': 'plum', '#800080': 'purple', '#FF0000': 'red', '#FA8072': 'salmon', '#2E8B57': 'seaGreen', '#FFF5EE': 'seaShell', '#A0522D': 'sienna', '#C0C0C0': 'silver', '#87CEEB': 'skyBlue', '#FFFAFA': 'snow', '#D2B48C': 'tan', '#008080': 'teal', '#D8BFD8': 'thistle', '#FF6347': 'tomato', '#EE82EE': 'violet', '#F5DEB3': 'wheat', '#FFFFFF': 'white', '#FFFF00': 'yellow', } # Interwiki map for converting links to interwiki form # Table format | NAME || URI interwiki_map = { "AbbeNormal": "http://ourpla.net/cgi/pikie?$1", "Acronym": "http://www.acronymfinder.com/af-query.asp?String=exact&Acronym=$1", "advisory": "http://advisory.wikimedia.org/wiki/$1", "Advogato": "http://www.advogato.org/$1", "Aew": "http://wiki.arabeyes.org/$1", "Airwarfare": "http://airwarfare.com/mediawiki-1.4.5/index.php?$1", "AIWiki": 
"http://www.ifi.unizh.ch/ailab/aiwiki/aiw.cgi?$1", "AllWiki": "http://allwiki.com/index.php/$1", "Appropedia": "http://www.appropedia.org/$1", "AquariumWiki": "http://www.theaquariumwiki.com/$1", "arXiv": "http://arxiv.org/abs/$1", "AspieNetWiki": "http://aspie.mela.de/index.php/$1", "AtmWiki": "http://www.otterstedt.de/wiki/index.php/$1", "BattlestarWiki": "http://en.battlestarwiki.org/wiki/$1", "BEMI": "http://bemi.free.fr/vikio/index.php?$1", "BenefitsWiki": "http://www.benefitslink.com/cgi-bin/wiki.cgi?$1", "BibleWiki": "http://bible.tmtm.com/wiki/$1", "BluWiki": "http://www.bluwiki.org/go/$1", "Botwiki": "http://botwiki.sno.cc/wiki/$1", "Boxrec": "http://www.boxrec.com/media/index.php?$1", "BrickWiki": "http://brickwiki.org/index.php?title=$1", "BridgesWiki": "http://c2.com:8000/$1", "bugzilla": "https://bugzilla.wikimedia.org/show_bug.cgi?id=$1", "buzztard": "http://buzztard.org/index.php/$1", "Bytesmiths": "http://www.Bytesmiths.com/wiki/$1", "C2find": "http://c2.com/cgi/wiki?FindPage&value=$1", "Cache": "http://www.google.com/search?q=cache:$1", "CanyonWiki": "http://www.canyonwiki.com/wiki/index.php/$1", "CANWiki": "http://www.can-wiki.info/$1", "CellWiki": "http://cell.wikia.com/wiki/$1", "CentralWikia": "http://www.wikia.com/wiki/$1", "ChoralWiki": "http://www.cpdl.org/wiki/index.php/$1", "Ciscavate": "http://ciscavate.org/index.php/$1", "Citizendium": "http://en.citizendium.org/wiki/$1", "CKWiss": "http://ck-wissen.de/ckwiki/index.php?title=$1", "CNDbName": "http://cndb.com/actor.html?name=$1", "CNDbTitle": "http://cndb.com/movie.html?title=$1", "CoLab": "http://colab.info", "Comixpedia": "http://www.comixpedia.org/index.php?title=$1", "comcom": "http://comcom.wikimedia.org/wiki/$1", "CommunityScheme": "http://community.schemewiki.org/?c=s&key=$1", "comune": "http://rete.comuni-italiani.it/wiki/$1", "Consciousness": "http://teadvus.inspiral.org/index.php/$1", "CorpKnowPedia": "http://corpknowpedia.org/wiki/index.php/$1", "CrazyHacks": 
"http://www.crazy-hacks.org/wiki/index.php?title=$1", "CreaturesWiki": "http://creatures.wikia.com/wiki/$1", "DAwiki": "http://www.dienstag-abend.de/wiki/index.php/$1", "Dcc": "http://www.dccwiki.com/$1", "DCDatabase": "http://www.dcdatabaseproject.com/wiki/$1", "DCMA": "http://www.christian-morgenstern.de/dcma/$1", "DejaNews": "http://www.deja.com/=dnc/getdoc.xp?AN=$1", "Delicious": "http://del.icio.us/tag/$1", "Demokraatia": "http://wiki.demokraatia.ee/index.php/$1", "Devmo": "http://developer.mozilla.org/en/docs/$1", "dict": "http://www.dict.org/bin/Dict?Database=*&Form=Dict1&Strategy=*&Query=$1", "Disinfopedia": "http://www.sourcewatch.org/wiki.phtml?title=$1", "distributedproofreaders": "http://www.pgdp.net/wiki/$1", "distributedproofreadersca": "http://www.pgdpcanada.net/wiki/index.php/$1", "dmoz": "http://www.dmoz.org/$1", "dmozs": "http://www.dmoz.org/cgi-bin/search?search=$1", "DocBook": "http://wiki.docbook.org/topic/$1", "DOI": "http://dx.doi.org/$1", "doom_wiki": "http://doom.wikia.com/wiki/$1", "download": "http://download.wikimedia.org/$1", "DRAE": "http://buscon.rae.es/draeI/SrvltGUIBusUsual?LEMA=$1", "Dreamhost": "http://wiki.dreamhost.com/index.php/$1", "DrumCorpsWiki": "http://www.drumcorpswiki.com/index.php/$1", "DWJWiki": "http://www.suberic.net/cgi-bin/dwj/wiki.cgi?$1", "EcoReality": "http://www.EcoReality.org/wiki/$1", "EfnetCeeWiki": "http://purl.net/wiki/c/$1", "EfnetCppWiki": "http://purl.net/wiki/cpp/$1", "EfnetPythonWiki": "http://purl.net/wiki/python/$1", "EfnetXmlWiki": "http://purl.net/wiki/xml/$1", "ELibre": "http://enciclopedia.us.es/index.php/$1", "EmacsWiki": "http://www.emacswiki.org/cgi-bin/wiki.pl?$1", "EnergieWiki": "http://www.netzwerk-energieberater.de/wiki/index.php/$1", "EoKulturCentro": "http://esperanto.toulouse.free.fr/nova/wikini/wakka.php?wiki=$1", "Ethnologue": "http://www.ethnologue.com/show_language.asp?code=$1", "EvoWiki": "http://wiki.cotch.net/index.php/$1", "Exotica": "http://www.exotica.org.uk/wiki/$1", 
"FanimutationWiki": "http://wiki.animutationportal.com/index.php/$1", "FinalEmpire": "http://final-empire.sourceforge.net/cgi-bin/wiki.pl?$1", "FinalFantasy": "http://finalfantasy.wikia.com/wiki/$1", "Finnix": "http://www.finnix.org/$1", "FlickrUser": "http://www.flickr.com/people/$1", "FloralWIKI": "http://www.floralwiki.co.uk/wiki/$1", "FlyerWiki-de": "http://de.flyerwiki.net/index.php/$1", "Foldoc": "http://www.foldoc.org/$1", "ForthFreak": "http://wiki.forthfreak.net/index.cgi?$1", "FoxWiki": "http://fox.wikis.com/wc.dll?Wiki~$1", "FreeBio": "http://freebiology.org/wiki/$1", "FreeBSDman": "http://www.FreeBSD.org/cgi/man.cgi?apropos=1&query=$1", "FreeCultureWiki": "http://wiki.freeculture.org/index.php/$1", "Freedomdefined": "http://freedomdefined.org/$1", "FreeFeel": "http://freefeel.org/wiki/$1", "FreekiWiki": "http://wiki.freegeek.org/index.php/$1", "ganfyd": "http://ganfyd.org/index.php?title=$1", "GaussWiki": "http://gauss.ffii.org/$1", "Gentoo-Wiki": "http://gentoo-wiki.com/$1", "GenWiki": "http://wiki.genealogy.net/index.php/$1", "GlobalVoices": "http://cyber.law.harvard.edu/dyn/globalvoices/wiki/$1", "GlossarWiki": "http://glossar.hs-augsburg.de/$1", "GlossaryWiki": "http://glossary.hs-augsburg.de/$1", "Golem": "http://golem.linux.it/index.php/$1", "Google": "http://www.google.com/search?q=$1", "GoogleDefine": "http://www.google.com/search?q=define:$1", "GoogleGroups": "http://groups.google.com/groups?q=$1", "GotAMac": "http://www.got-a-mac.org/$1", "GreatLakesWiki": "http://greatlakeswiki.org/index.php/$1", "Guildwiki": "http://guildwars.wikia.com/wiki/$1", "gutenberg": "http://www.gutenberg.org/etext/$1", "gutenbergwiki": "http://www.gutenberg.org/wiki/$1", "H2Wiki": "http://halowiki.net/p/$1", "HammondWiki": "http://www.dairiki.org/HammondWiki/index.php3?$1", "heroeswiki": "http://heroeswiki.com/$1", "HerzKinderWiki": "http://www.herzkinderinfo.de/Mediawiki/index.php/$1", "HKMule": "http://www.hkmule.com/wiki/$1", "HolshamTraders": 
"http://www.holsham-traders.de/wiki/index.php/$1", "HRWiki": "http://www.hrwiki.org/index.php/$1", "HRFWiki": "http://fanstuff.hrwiki.org/index.php/$1", "HumanCell": "http://www.humancell.org/index.php/$1", "HupWiki": "http://wiki.hup.hu/index.php/$1", "IMDbName": "http://www.imdb.com/name/nm$1/", "IMDbTitle": "http://www.imdb.com/title/tt$1/", "IMDbCompany": "http://www.imdb.com/company/co$1/", "IMDbCharacter": "http://www.imdb.com/character/ch$1/", "Incubator": "http://incubator.wikimedia.org/wiki/$1", "infoAnarchy": "http://www.infoanarchy.org/en/$1", "Infosecpedia": "http://www.infosecpedia.org/pedia/index.php/$1", "Infosphere": "http://theinfosphere.org/$1", "irc": "irc://irc.freenode.net/$1", "Iuridictum": "http://iuridictum.pecina.cz/w/$1", "JamesHoward": "http://jameshoward.us/$1", "JavaNet": "http://wiki.java.net/bin/view/Main/$1", "Javapedia": "http://wiki.java.net/bin/view/Javapedia/$1", "JEFO": "http://esperanto-jeunes.org/wiki/$1", "JiniWiki": "http://www.cdegroot.com/cgi-bin/jini?$1", "JspWiki": "http://www.ecyrd.com/JSPWiki/Wiki.jsp?page=$1", "JSTOR": "http://www.jstor.org/journals/$1", "Kamelo": "http://kamelopedia.mormo.org/index.php/$1", "Karlsruhe": "http://ka.stadtwiki.net/$1", "KerimWiki": "http://wiki.oxus.net/$1", "KinoWiki": "http://kino.skripov.com/index.php/$1", "KmWiki": "http://kmwiki.wikispaces.com/$1", "KontuWiki": "http://kontu.merri.net/wiki/$1", "KoslarWiki": "http://wiki.koslar.de/index.php/$1", "Kpopwiki": "http://www.kpopwiki.com/$1", "LinguistList": "http://linguistlist.org/forms/langs/LLDescription.cfm?code=$1", "LISWiki": "http://liswiki.org/wiki/$1", "LiteratePrograms": "http://en.literateprograms.org/$1", "Livepedia": "http://www.livepedia.gr/index.php?title=$1", "Lojban": "http://www.lojban.org/tiki/tiki-index.php?page=$1", "Lostpedia": "http://lostpedia.wikia.com/wiki/$1", "LQWiki": "http://wiki.linuxquestions.org/wiki/$1", "LugKR": "http://lug-kr.sourceforge.net/cgi-bin/lugwiki.pl?$1", "Luxo": 
"http://toolserver.org/~luxo/contributions/contributions.php?user=$1", "lyricwiki": "http://www.lyricwiki.org/$1", "mail": "https://lists.wikimedia.org/mailman/listinfo/$1", "mailarchive": "http://lists.wikimedia.org/pipermail/$1", "Mariowiki": "http://www.mariowiki.com/$1", "MarvelDatabase": "http://www.marveldatabase.com/wiki/index.php/$1", "MeatBall": "http://www.usemod.com/cgi-bin/mb.pl?$1", "MemoryAlpha": "http://memory-alpha.org/en/wiki/$1", "MetaWiki": "http://sunir.org/apps/meta.pl?$1", "Mineralienatlas": "http://www.mineralienatlas.de/lexikon/index.php/$1", "MoinMoin": "http://moinmo.in/$1", "Monstropedia": "http://www.monstropedia.org/?title=$1", "MosaPedia": "http://mosapedia.de/wiki/index.php/$1", "MozCom": "http://mozilla.wikia.com/wiki/$1", "MozillaWiki": "http://wiki.mozilla.org/$1", "MozillaZineKB": "http://kb.mozillazine.org/$1", "MusicBrainz": "http://wiki.musicbrainz.org/$1", "MW": "http://www.mediawiki.org/wiki/$1", "MWOD": "http://www.merriam-webster.com/cgi-bin/dictionary?book=Dictionary&va=$1", "MWOT": "http://www.merriam-webster.com/cgi-bin/thesaurus?book=Thesaurus&va=$1", "NetVillage": "http://www.netbros.com/?$1", "NKcells": "http://www.nkcells.info/wiki/index.php/$1", "NoSmoke": "http://no-smok.net/nsmk/$1", "Nost": "http://nostalgia.wikipedia.org/wiki/$1", "OEIS": "http://www.research.att.com/~njas/sequences/$1", "OldWikisource": "http://wikisource.org/wiki/$1", "OLPC": "http://wiki.laptop.org/go/$1", "OneLook": "http://www.onelook.com/?ls=b&w=$1", "OpenFacts": "http://openfacts.berlios.de/index.phtml?title=$1", "Openstreetmap": "http://wiki.openstreetmap.org/wiki/$1", "OpenWetWare": "http://openwetware.org/wiki/$1", "OpenWiki": "http://openwiki.com/?$1", "Opera7Wiki": "http://operawiki.info/$1", "OrganicDesign": "http://www.organicdesign.co.nz/$1", "OrgPatterns": "http://www.bell-labs.com/cgi-user/OrgPatterns/OrgPatterns?$1", "OrthodoxWiki": "http://orthodoxwiki.org/$1", "OSI reference model": "http://wiki.tigma.ee/index.php/$1", 
"OTRS": "https://ticket.wikimedia.org/otrs/index.pl?Action=AgentTicketZoom&TicketID=$1", "OTRSwiki": "http://otrs-wiki.wikimedia.org/wiki/$1", "OurMedia": "http://www.socialtext.net/ourmedia/index.cgi?$1", "PaganWiki": "http://www.paganwiki.org/wiki/index.php?title=$1", "Panawiki": "http://wiki.alairelibre.net/wiki/$1", "PangalacticOrg": "http://www.pangalactic.org/Wiki/$1", "PerlConfWiki": "http://perl.conf.hu/index.php/$1", "PerlNet": "http://perl.net.au/wiki/$1", "PersonalTelco": "http://www.personaltelco.net/index.cgi/$1", "PHWiki": "http://wiki.pocketheaven.com/$1", "PhpWiki": "http://phpwiki.sourceforge.net/phpwiki/index.php?$1", "PlanetMath": "http://planetmath.org/?op=getobj&from=objects&id=$1", "PMEG": "http://www.bertilow.com/pmeg/$1.php", "PMWiki": "http://old.porplemontage.com/wiki/index.php/$1", "PurlNet": "http://purl.oclc.org/NET/$1", "pyrev": "http://svn.wikimedia.org/viewvc/pywikipedia?view=rev&revision=$1", "PythonInfo": "http://www.python.org/cgi-bin/moinmoin/$1", "PythonWiki": "http://www.pythonwiki.de/$1", "psycle": "http://psycle.sourceforge.net/wiki/$1", "qcwiki": "http://wiki.quantumchemistry.net/index.php/$1", "quality": "http://quality.wikimedia.org/wiki/$1", "Qwiki": "http://qwiki.caltech.edu/wiki/$1", "r3000": "http://prinsig.se/weekee/$1", "RakWiki": "http://rakwiki.no-ip.info/$1", "Raec": "http://www.raec.clacso.edu.ar:8080/raec/Members/raecpedia/$1", "rev": "http://www.mediawiki.org/wiki/Special:Code/MediaWiki/$1", "ReVo": "http://purl.org/NET/voko/revo/art/$1.html", "RFC": "http://tools.ietf.org/html/rfc$1", "RheinNeckar": "http://wiki.rhein-neckar.de/index.php/$1", "RoboWiki": "http://robowiki.net/?$1", "ReutersWiki": "http://glossary.reuters.com/index.php/$1", "RoWiki": "http://wiki.rennkuckuck.de/index.php/$1", "rtfm": "ftp://rtfm.mit.edu/pub/faqs/$1", "S23Wiki": "http://s23.org/wiki/$1", "Scholar": "http://scholar.google.com/scholar?q=$1", "SchoolsWP": "http://schools-wikipedia.org/wiki/$1", "Scores": 
"http://www.imslp.org/wiki/$1", "Scoutwiki": "http://en.scoutwiki.org/$1", "Scramble": "http://www.scramble.nl/wiki/index.php?title=$1", "SeaPig": "http://www.seapig.org/$1", "SeattleWiki": "http://seattlewiki.org/wiki/$1", "SeattleWireless": "http://seattlewireless.net/?$1", "SLWiki": "http://wiki.secondlife.com/wiki/$1", "SenseisLibrary": "http://senseis.xmp.net/?$1", "silcode": "http://www.sil.org/iso639-3/documentation.asp?id=$1", "Shakti": "http://cgi.algonet.se/htbin/cgiwrap/pgd/ShaktiWiki/$1", "Slashdot": "http://slashdot.org/article.pl?sid=$1", "SMikipedia": "http://www.smiki.de/$1", "SourceForge": "http://sourceforge.net/$1", "spcom": "http://spcom.wikimedia.org/wiki/$1", "species": "http://species.wikimedia.org/wiki/$1", "Squeak": "http://wiki.squeak.org/squeak/$1", "stable": "http://stable.toolserver.org/$1", "strategy" : "http://strategy.wikimedia.org/wiki/$1", "StrategyWiki": "http://strategywiki.org/wiki/$1", "sulutil": "http://toolserver.org/~vvv/sulutil.php?user=$1", "Susning": "http://www.susning.nu/$1", "Swtrain": "http://train.spottingworld.com/$1", "svn": "http://svn.wikimedia.org/viewvc/mediawiki/$1?view=log", "SVGWiki": "http://www.protocol7.com/svg-wiki/default.asp?$1", "SwinBrain": "http://mercury.it.swin.edu.au/swinbrain/index.php/$1", "SwingWiki": "http://www.swingwiki.org/$1", "TabWiki": "http://www.tabwiki.com/index.php/$1", "Takipedia": "http://www.takipedia.org/wiki/$1", "Tavi": "http://tavi.sourceforge.net/$1", "TclersWiki": "http://wiki.tcl.tk/$1", "Technorati": "http://www.technorati.com/search/$1", "TEJO": "http://www.tejo.org/vikio/$1", "TESOLTaiwan": "http://www.tesol-taiwan.org/wiki/index.php/$1", "Testwiki": "http://test.wikipedia.org/wiki/$1", "Thelemapedia": "http://www.thelemapedia.org/index.php/$1", "Theopedia": "http://www.theopedia.com/$1", "ThePPN": "http://wiki.theppn.org/$1", "ThinkWiki": "http://www.thinkwiki.org/wiki/$1", "TibiaWiki": "http://tibia.erig.net/$1", "ticket": 
"https://ticket.wikimedia.org/otrs/index.pl?Action=AgentTicketZoom&TicketNumber=$1", "TMBW": "http://tmbw.net/wiki/$1", "TmNet": "http://www.technomanifestos.net/?$1", "TMwiki": "http://www.EasyTopicMaps.com/?page=$1", "TokyoNights": "http://wiki.tokyo-nights.com/wiki/$1", "Tools": "http://toolserver.org/$1", "tswiki": "http://wiki.toolserver.org/view/$1", "translatewiki": "http://translatewiki.net/wiki/$1", "Trash!Italia": "http://trashware.linux.it/wiki/$1", "Turismo": "http://www.tejo.org/turismo/$1", "TVIV": "http://tviv.org/wiki/$1", "TVtropes": "http://www.tvtropes.org/pmwiki/pmwiki.php/Main/$1", "TWiki": "http://twiki.org/cgi-bin/view/$1", "TwistedWiki": "http://purl.net/wiki/twisted/$1", "TyvaWiki": "http://www.tyvawiki.org/wiki/$1", "Unreal": "http://wiki.beyondunreal.com/wiki/$1", "Urbandict": "http://www.urbandictionary.com/define.php?term=$1", "USEJ": "http://www.tejo.org/usej/$1", "UseMod": "http://www.usemod.com/cgi-bin/wiki.pl?$1", "ValueWiki": "http://www.valuewiki.com/w/$1", "Veropedia": "http://en.veropedia.com/a/$1", "Vinismo": "http://vinismo.com/en/$1", "VLOS": "http://www.thuvienkhoahoc.com/tusach/$1", "VKoL": "http://kol.coldfront.net/thekolwiki/index.php/$1", "VoIPinfo": "http://www.voip-info.org/wiki/view/$1", "WarpedView": "http://www.warpedview.com/mediawiki/index.php/$1", "WebDevWikiNL": "http://www.promo-it.nl/WebDevWiki/index.php?page=$1", "Webisodes": "http://www.webisodes.org/$1", "WebSeitzWiki": "http://webseitz.fluxent.com/wiki/$1", "wg": "http://wg.en.wikipedia.org/wiki/$1", "Wikianso": "http://www.ansorena.de/mediawiki/wiki/$1", "Wikible": "http://wikible.org/en/$1", "Wikichat": "http://www.wikichat.org/$1", "WikiChristian": "http://www.wikichristian.org/index.php?title=$1", "WikiF1": "http://www.wikif1.org/$1", "WikiFur": "http://en.wikifur.com/wiki/$1", "wikiHow": "http://www.wikihow.com/$1", "WikiIndex": "http://wikiindex.com/$1", "WikiLemon": "http://wiki.illemonati.com/$1", "Wikilivres": "http://wikilivres.info/wiki/$1", 
"WikiMac-de": "http://apfelwiki.de/wiki/Main/$1", "WikiMac-fr": "http://www.wikimac.org/index.php/$1", "Wikinfo": "http://www.wikinfo.org/index.php/$1", "Wikinurse": "http://wikinurse.org/media/index.php?title=$1", "Wikinvest": "http://www.wikinvest.com/$1", "Wikipaltz": "http://www.wikipaltz.com/wiki/$1", "Wikireason": "http://wikireason.net/wiki/$1", "Wikischool": "http://www.wikischool.de/wiki/$1", "wikisophia": "http://wikisophia.org/index.php?title=$1", "Wikispot": "http://wikispot.org/?action=gotowikipage&v=$1", "WikiTI": "http://wikiti.denglend.net/index.php?title=$1", "WikiTravel": "http://wikitravel.org/en/$1", "WikiTree": "http://wikitree.org/index.php?title=$1", "Wipipedia": "http://www.londonfetishscene.com/wipi/index.php/$1", "WLUG": "http://www.wlug.org.nz/$1", "wmau": "http://wikimedia.org.au/wiki/$1", "wmcz": "http://meta.wikimedia.org/wiki/Wikimedia_Czech_Republic/$1", "wmno": "http://no.wikimedia.org/wiki/$1", "wmrs": "http://rs.wikimedia.org/wiki/$1", "wmse": "http://se.wikimedia.org/wiki/$1", "wmuk": "http://uk.wikimedia.org/wiki/$1", "wm2005": "http://wikimania2005.wikimedia.org/wiki/$1", "wm2006": "http://wikimania2006.wikimedia.org/wiki/$1", "wm2007": "http://wikimania2007.wikimedia.org/wiki/$1", "wm2008": "http://wikimania2008.wikimedia.org/wiki/$1", "wm2009": "http://wikimania2009.wikimedia.org/wiki/$1", "wm2010": "http://wikimania2010.wikimedia.org/wiki/$1", "wmania": "http://wikimania.wikimedia.org/wiki/$1", "WMF": "http://wikimediafoundation.org/wiki/$1", "Wookieepedia": "http://starwars.wikia.com/wiki/$1", "World66": "http://www.world66.com/$1", "WoWWiki": "http://www.wowwiki.com/$1", "Wqy": "http://wqy.sourceforge.net/cgi-bin/index.cgi?$1", "WurmPedia": "http://www.wurmonline.com/wiki/index.php/$1", "WZNAN": "http://www.wikiznanie.ru/wiki/article/$1", "Xboxic": "http://wiki.xboxic.com/$1", "ZRHwiki": "http://www.zrhwiki.ch/wiki/$1", "ZUM": "http://wiki.zum.de/$1", "ZWiki": "http://www.zwiki.org/$1", "ZZZ Wiki": 
"http://wiki.zzz.ee/index.php/$1", } def fixStyle(text): pass def getdateformat(text): """ ISO DMY MDY """ return 'DMY' def fix(text="", page=None, verbose = True): if not page: page = wikipedia.Page(wikipedia.getSite(), 'Special:ParserTests') if not text: text=page.get() # ## Hacks # text = text.replace('http://www.news.bbc.co.uk', 'http://news.bbc.co.uk') # TODO: Fix accessyear/acessdate mismatch # Peer Reviewer script had for sometime time convert URL into the following bad form text = re.sub(r'\{\{[Cc]ite web\s*\|\s*url\s*=\s*http://(?P[^{|}]+)\s*\|\s*title\s*=\s*(http://)?(?P=title)\s*(<!--[^<>]+-->)?\s*((\|format=(PDF|DOC))|(\|\s*accessdate *= *[^{|}]+))*\}\}', r'[http://\g<title>]', text) # a second time since we seem to hittings limits text = re.sub(r'\{\{[Cc]ite web\s*\|url=(http://[^{|}]+)\s*\|title=([^{=}]+<!--[^<=>/]+-->)(\|format=(PDF|DOC))?\}\}', r'[\1 \2]', text) # Following the collapse of MiB preference PDFbot converts to the new format when saving text = re.sub(r'\{\{(PDF(?:link)?\|[^{|}]+\|[\d\.]+) \[\[[^|]+\|([KMG])iB\]\]<!--[^<>]+-->\}\}', r'{{\1 \2B}}', text) # EN MOS -- Format Retrieved XXXX # FIXME ]. [[Encyklopedia Interia]]. Last accessed 20 January 2007.</ref> text = re.sub(r'((?<=[]["\'])|[.,(;?])(?P<qm>[)"\']|) *((URL *)?([Ll]ink *)?([Ll]ast *)?([Aa]ccess|reach)e?d?( +on| +online)?|[Rr]etrie?ved?) +(?P<date>(((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w+|\[\[|20\d\d|\d|0\d|1\d|2\d|3[01]|\]\])[,.)\s\-]*?)+)[,.)\s]*</ref>', r'.\g<qm> Retrieved \g<date>.</ref>', text) # deprecated date linking, remove in citations text = re.sub(r'\[\[(\d+ (?:January|February|March|April|May|June|July|August|September|October|November|December))\]\],? \[\[(\d{4})\]\](?=[^<>]*</ref>)', r'\1 \2', text) text = re.sub(r'\[\[((?:January|February|March|April|May|June|July|August|September|October|November|December) \d+)\]\],? 
\[\[(\d{4})\]\](?=[^<>]*</ref>)', r'\1, \2', text) # ## Comments # # Update {{NoMoreLinks}} text = re.sub(r'<!--=+\(\{\{No ?More ?Links\}\}\)=+([^<>]+|-->(\n*<!--.*?-->\n)+<!--)=+\(\{\{No ?More ?Links\}\}\)=+-->', '{{subst:NoMoreLinks}}', text) # Remove comment from the instroduction of footnotes text = re.sub(r"\n?<!--[^<>]*[Ss]ee +http://en.wikipedia.org/wiki/Wikipedia:Footnotes +[^<>]+generate([^<>]|<(?=/?ref)[^<>]*>)+-->", '', text) # Remove outdated comments text = re.sub(r'\n?<!--\s*Categories\s*-->', '', text) #### Some bad script ###text = re.sub(r'\{\{cite web\|url=([\d.\-]+)\|title=[^{|}]*\|last=([^{|}]+)\|first=([^{|}]+)\}\}', r'\2 \3. \1', text) # Now that we got all the stuff that deals with comments out the way we can hide them to prevent mismatching text = hideText(text) if page.site().sitename() == 'wikipedia:en' and page.namespace() in [0, 2, 6]: wikipedia.output("Apply English Wikipedia fixes") text = formatEnglishWikipediaTemplate(page, text) # ## HTML ## # # <b> & <i> to ''' & '' text = re.sub(r"(?<!')<b>([^{|}<>\n']*?)</b>(?!')", r"'''\1'''", text) text = re.sub(r"(?<!')<i>([^{|}<>\n']*?)</i>(?!')", r"''\1''", text) # Standardize tables text = re.sub(r'\n\|-+(?=[^{|}\n]*\n)', r'\n|-', text) text = re.sub(r'\n\|-(?=\w)', r'\n|- ', text) text = re.sub(r'\n\|-[^{}|<>\n]*(?=\n\|-)', r'', text) text = re.sub(r'(\n\{\|[^][{}|<>\n]*)\n+(?=[|!][^+\-{}\n]+\n)', r'\1\n|-\n', text) text = re.sub(r'\n\|-[^{}|<>\n]*\n*(?=\n\|\})', r'', text) text = fixHTML(page,text) saved = text # saved state # Merge styles in a table for property in ['text-align', 'vertical-align', 'font-size', 'font-family', 'color', 'background','background-color']: text = re.sub(r''' \|-([^\n{|}[\]]*?)( * \|[^{|}[\]]*style="[^"]*('''+property+r''':[^;"]+;)[^"]*"[^{|}[\]]*\|[^|\n]*?((?:\n\|(?!-)|\|\|)[^{|}[\]]*style="[^"]*\3[^"]*"[^{|}[\]]*\|[^|\n]*)+)(?= \|[-}])''', r'\n|-\1 style="\3" \2', text) p = re.compile(r'''( \|- style="[^"]*?('''+property+r''':[^";]+;)[^"]*?"[^\n{|}[\]]*( 
\|(?!-)(?:[^[\]{|}]*\|[^\n]*?))*? \|(?!-)[^{|}[\]]*style="[^"]*)\2 *(?=[^"]*"[^[\]{|}]*\|[^\n])''') while p.search(text): text = p.sub(r'\1', text) if saved != text: text = fixHTML(page,text) # ## Hyperlinking ## # # Remove url junk (tracking, referrers, client info) for i in range(0,9): text = re.sub(r'(http://[^][<>\s"|])(&client=firefox-a|<=)(?=[][<>\s"|&])', r'\1', text) text = text.replace('[{{SERVER}}{{localurl:', '[{{fullurl:') # Use magic words instead # text = re.sub(r'\[http://en.wikipedia.org/w/index.php\?title=([^][<>"\s&=?]+)&?([^][<>"\s]*)', r'[{{fullurl:\1|\2}}', text) # convert (see http://...) into <http://...>, which is better handled by software text = re.sub(r'(?i)[(](?:see|) *(http://[^][<>"\s(|)]+[\w=/&])\s?[)]', r'<\1>', text) # From fixes.py # external link in double brackets text = re.sub(r'\[\[(?P<url>https?://[^\]\n]+?)\]\]', r'[\g<url>]', text) # external link starting with double bracket text = re.sub(r'\[\[(?P<url>https?://.+?)\]', r'[\g<url>]', text) # pipe in url (unlikely to go wrong) text = re.sub(r'\[(?P<url>https?://[^][<>\s"\|;?]+?\.(aspx?|doc|f?cgi|html?|jsp|pdf|php|pl|ppt|rtf|txt|xml)) *\| *(?P<label>[^\|\]]+?)\]', r'[\g<url> \g<label>]', text) # Use of Image: #if '[[Image:' in text: # text = re.sub(r'(?i)\[\[(:?)File:([^][{|}]+\.(djvu|jpe?g|png|gif|svg|tiff))(?=\||\]\])', r'[[\1Image:\2', text) text = re.sub(r'(?i)\[\[(:?)Image:([^][{|}]+\.(pdf|midi?|ogg|ogv|xcf))(?=\||\]\])', r'[[\1File:\2', text) # Commons fixes for URLs # TODO: remove domain name titles [http://example.com/aboutus.pdf example.com] # | url= http://www.statcan.ca/english/sdds/instrument/3901_Q2_V2_E.pdf] (fx by removing the invalid []) text = re.sub(ur'(http:/* *){2,}(?=[a-z0-9:.\-]+/)', 'http://', text) # Silently correct http://http:/ text = re.sub(ur"(\[\w+://[^][<>\"\s]+?)''", r"\1 ''", text) # corrects [http://''title''] (nospaces) -> [http:// ''title''] text = re.sub(ur'(?u)\[\n*(\w+://[^][<>"\s]+ *(?:(?<= )[^\n\]<>]*?|))\n([^[\]<>{}\n=@/]*?) 
*\n*\]', ur'[\1 \2]', text) # Fix some links which were broken with a line break text = re.sub(ur'(?u)\[(\w+://[^][<>"\s]+) +([Cc]lick here|[Hh]ere|\W|→|[ -/;-@]) *\]', ur'\2 [\1]', text) # remove unhelpful titles for screen readers # Embedded images with bad anchors text = re.sub(r'(?i)(\[\[(?:File|Image):[^][<>{|}]+)#(|filehistory|filelinks|file)(?=[\]|])', r'\1', text) text = ext2intLinks(page, text) text = simplifyLinks(page, text) ## References ## # This is need because of <gallery>Image1.jpg|caption<ref>this is hidden</ref></gallery> text = fixReferences(page, text) text = showText(text) if noreferences and page.namespace() != 10 and page.title() != 'Special:ParserTests': norefbot = noreferences.NoReferencesBot(None) if norefbot.lacksReferences(text, verbose=False): text = norefbot.addReferences(text) return text def formatEnglishWikipediaTemplate(page, text): ''' hello ''' # merge all variant of cite web # make into {'dictname':(t1, t2, t3),} text = re.sub(r'(?i)\{\{\s*(cite[_ \-]*(url|web|website)|Web[_ \-]*(citation|cite|reference|reference[_ ]4))(?=\s*\|)', '{{cite web', text) ## Unlink # Remove formatting on certian parameters text = re.sub(r"(\|\s*(?:agency|author|first|format|language|last|location|month|publisher|work|year)\s*=\s*)(''|'''|''''')((?:\[\[[^][|]+|\[\[|)[][\w\s,.~!`\"]+)(''+)(?=\s*\|[\w\s]+=|\s*\}\})", r'\1\3', text) # Unlink well known publisher parameters (add work=?) 
text = re.sub(r'(?i)(\|\s*(?:publisher|newpaper)\s*=\s*)\[\[(?:[Tt]he )?('+('|'.join(commonPublishers))+')\]\]', r'\1\2', text) # Unlink PDF in format parameters text = re.sub(r'(?i)(\|\s*format\s*=\s*)\[\[(adobe|portable|document|file|format|pdf|\.|\s|\(|\)|\|)+\]\]', r'\1PDF', text) text = re.sub(r'(?i)(\|\s*format\s*=\s*)(\s*\.?(adobe|portable|document|file|format|pdf|\(|\)))+?(\s*[|}])', r'\1PDF\4', text) # No |format=HTML says {{cite web/doc}} text = re.sub(r'(?i)(\{\{cite[^{}]+)\|\s*format\s*=\s*(\[\[[^][|]+\||\[\[|)(\]\]| |html?|world|wide|web)+\s*(?=\||\}\})', r'\1', text) ## Fix parameters # Fix accessdate tags [[WP:AWB/FR#Fix accessdate tags]] text = re.sub(r'(\|\s*)a[ces]{3,8}date(\s*=\s*)(?=[^{|}]*20\d\d|\}\})', r'\1accessdate\2', text) text = re.sub(r'accessdate(\s*=\s*)\[*(200\d)[/_\-](\d{2})[/_\-](\d{2})\]*', r'accessdate\1\2-\3-\4', text) text = re.sub(r'(\|\s*)a[cs]*es*mou*nthday(\s*=\s*)', r'\1accessmonthday\2', text) text = re.sub(r'(\|\s*)a[cs]*es*daymou*nth(\s*=\s*)', r'\1accessdaymonth\2', text) text = re.sub(r'(\|\s*)accessdate(\s*=\s*[0-3]?[0-9] +(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*)([^][<>}{]*accessyear[\s=]+20\d\d)', r'\1accessdaymonth\2\3', text) text = re.sub(r'(\|\s*)accessdate(\s*=\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w* +[0-3]?[0-9])([^][<>}{]*accessyear[\s=]+20\d\d)', r'\1accessmonthday\2\3', text) text = re.sub(r'(\|\s*)accessdaymonth(\s*=\s*)\s*([^{|}<>]+?)\s*(\|[^][<>}{]*accessyear[\s=]+)(20\d\d)', r'\1accessdate\2\3 \5', text) text = re.sub(r'(\|\s*)accessmonthday(\s*=\s*)\s*([^{|}<>]+?)\s*(\|[^][<>}{]*accessyear[\s=]+)(20\d\d)', r'\1accessdate\2\3, \5', text) # Fix improper dates text = re.sub(r'(?i)(\{\{cit[ea][^{}]+\|\s*date\s*=\s*\d{2}[/\-.]\d{2}[/\-.])([5-9]\d)(?=\s*[|}])', r'\g<1>19\g<3>', text) text = re.sub(r'(?i)(\{\{cit[ea][^{}]+\|\s*date\s*=\s*)(0[1-9]|1[012])[/\-.](1[3-9]|2\d|3[01])[/\-.](19\d\d|20\d\d)(?=\s*[|}])', r'\1\4-\2-\3', text) text = 
re.sub(r'(?i)(\{\{cit[ea][^{}]+\|\s*date\s*=\s*)(1[3-9]|2\d|3[01])[/\-.](0[1-9]|1[012])[/\-.](19\d\d|20\d\d)(?=\s*[|}])', r'\1\4-\3-\2', text) # Fix URLS lacking http:// text = re.sub(r'(\|\s*url\s*=\s*)([0-9a-z.\-]+\.[a-z]{2,4}/[^][{|}:\s"]\s*[|}])', r'\1http://\2', text) # Fix {{citation|title=[url title]}} text = re.sub(r'(?i)(\{\{cit[ea][^{}]*?)(\s*\|\s*)(?:url|title)(\s*=\s*)\[([^][<>\s"]*) +([^]\n]+)\](?=[|}])', r'\1\2url\3\4\2title\3\5', text) # Associated Press is usually the agency, not the work or publisher text = re.sub(r'(?i)\{\{\s*[Cc]ite\s*(?:web|news|newpaper|article)([^{}]+?)(\s*\|\s*)(?:publisher|work|author)(\s*=\s*)(\[\[[^[\]|]+\||\[\[|)(?P<agency>%s)(\]\])?(?=\s*\|[^][{}]+=|\s*\}\})' % '|'.join(agencies), r'{{cite news\1\2agency\3Associated Press', text) text = re.sub(r'(?i)(\{\{[^{}]+\|\s*url\s*=[^][{|}]+\.ap\.org/[^{}]+\|\s*)agency(\s*=\s*)Associated Press', r'\1work\2Associated Press', text) text = re.sub(r'(?i)(\{\{[^{}]+\|\s*)agency(\s*=\s*)Associated Press([^{}]+\|\s*url\s*=[^][{|}]+\.ap\.org/)', r'\1work\2Associated Press\3', text) # Fix pages=1 and page=20-44 and page=p. 22 , corner p. 23 section 5 # text = re.sub(r'(\{\{\s*(?:[Cc]ite (journal|news))[^{}]*\| *pages?\s*=\s*)(p[pg]?[. ]|pages?\b) *(?=[\d\-]+\s*[|}])', r'\1', text) text = re.sub(r'(?iu)(\{\{\s*(?:cite (?:journal|news|book|web)|citation)[^{}]*?\|\s*)pages(?=\s*=\s*(p|pp|pg|page|pages|)\b[.:]?\s*\d+\s*(\||\}\}))', r'\1page', text) text = re.sub(r'(?iu)(\{\{\s*(?:cite (?:journal|news|book|web)|citation)[^{}]*?\|\s*)page(?=\s*=\s*(p|pp|pg|page|pages|)\b[.:]?\s*\d+\s*[\-]\s*\d+\s*(\||\}\}))', r'\1pages', text) # \n in title causes links to break for m in re.finditer(r'\|\s*(?:title)\s*=\s*([^{|}]*?)\s*\|',text): text = text.replace(m.group(), m.group().replace(m.group(1), m.group(1).replace('\n', ' ').replace('\r', ' '))) # Change infoboxes from trailing pipes (likely stems from {{qif}} days) p = re.compile(r'(\{\{[\w\s_]*[Ii]nfobox([^{}]*?\{\{[^{}]+\}\})*[^{}]*?[^{|}](= )?) 
def fixHTML(page, text):
    '''
    Clean up HTML markup embedded in wikitext.

    Runs an ordered series of regex substitutions over text (old table
    header colours, <br> variants, <center>, nested <font>), passes the
    attribute part of tags and wiki table rows/cells through
    fixAttributes(), and finally prints a warning listing tags still
    present that HTML 5 removed.

    page -- not referenced in this function
    text -- wikitext to process
    Returns the modified wikitext.
    '''
    # Remove old {{prettytable}} header row formatting
    text = re.sub(r'(?i)(\n\{\| class="wikitable[^\n]+\n\|-[^\n]*)(bgcolor\W+CCC+|background\W+ccc+)(?=\W+\n!)', r'\1', text)
    # <br/> has no visible effect on output next to a block level item
    text = re.sub(r'(\n([^<\n]|<(?!br[^>]*>))+\w+[^\w\s<>]*)<br[ /]*>(?=\n[*#:;]|\n?<div|\n?<blockquote)', r'\1', text)

    # Fix br
    text = re.sub(r'(?i)(<br[^</>]*>)\n?</br>', r'\1', text)
    text = re.sub(r'(?i)<[/]?br([^{/}<>]*?)>', r'<br\1 />', text)
    # Arrg! people are using this is templated tables as a way to visually align items! See [[Battle of Stalingrad]]
    # text = re.sub(r'(<br[\s/]*>|\n *\n *){4,}', r'\n{{clear}}\n', text)
    text = re.sub(r'(?i)<br\s\S*clear\S*(all|both)\S*[\s/]*>', r'{{-}}', text)
    # NOTE(review): unlike the rule above this one is case-sensitive
    text = re.sub(r'<br\s\S*clear\S*(left|right)\S*[\s/]*>', r'{{clear\1}}', text)

    # Use class="center" instead of <center>
    text = re.sub(r'(?i)<center\b([^<>]*)>((?:[^<]|<(?!/?\s*center\s*>))*)</center>', r'<div class="center"\1>\2</div>', text)

    # combine font tags
    text = re.sub(r'(?i)(<font\b[^<>]*)> *\n?<font\b([^<>]*>)((?:[^<]|<(?!/?font))*?</font> *\n?)</font>', r'\1\2\3', text)
    # text = re.sub(r'(?i)<font ([^<>]*)>\[\[([^[\]{|}]+)\|([^[\]\n]*?)\]\]</font>', r'[[\2|<font \1>\3</font>]]', text)

    #TODO look for single character entiys such as ; \ in markup, but ignore /
    # Hands fixAttributes() a match with groups: 1 = opener (HTML tag start,
    # "{|" table start, "|-" row or cell delimiter), 'tag', 'cell' and 'attr'
    text = re.sub(r'(<(?P<tag>\w+)(?= +)|\n\{\||(?<=\n)\|-|(?P<cell>\n[!|]|!!|\|\|))(?P<attr>[^<>[\]{|}\n]+(?(tag)(?=>)|(?(cell)(?=[!|][^!|])|(?=\n))))', fixAttributes, text)

    # Convert simple <font> to <span>
    # NOTE: <font>[[link|text]]</font> transforms to [[link|<font>text</font>]] by tidy
    text = re.sub(r'<font(( +style="[^"]+")+)>(?!\[\[)((?:[^<]|<(?!/?font))*?)(?<!\]\])</font>', r'<span\1>\3</span>', text)

    # Count (case-sensitively) the remaining tags that HTML 5 dropped
    removedTags = {}
    for tag in re.findall(r'(?<=<)\w+(?=[^<>]*>)', text):
        # Deprecated and removed elements
        if tag in (
            # Fonts style elements
            "tt", "big", "small", "strike", "s", "u",
            # Font modifier
            "font", "basefont",
            # Misc
            "center", "dir"):
            removedTags[tag] = removedTags.get(tag, 0) + 1

    # Only reports; the tags themselves are left in the text
    if removedTags:
        wikipedia.output("\03{lightred}WARNING\03{default} : The following tags have been removed in the HTML 5 specifcation: %s" % ', '.join(('<%s> (%dx)' % t for t in removedTags.iteritems()) ))
    return text
def fixAttributes(node):
    # re.sub() callback used by fixHTML().
    #
    # node is a match object with groups 1 (the tag/table/row/cell opener),
    # 'tag' and 'attr'.  The attribute text is normalised (quoting, lower
    # case names, presentational attributes turned into style="", style
    # attributes merged and passed through fixCSS(), unknown attributes
    # dropped).  Returns the opener plus the rewritten attributes, or the
    # original match text when nothing meaningful changed or the result grew
    # to twice the original length or more.
    #
    # NOTE(review): the block nesting below was reconstructed from source
    # whose indentation was lost - confirm, especially inside
    # "if tag == 'table'".
    tag = node.group('tag')
    attr = node.group('attr')
    if tag:
        tag = tag.lower()
    elif '{|' in node.group(1):
        tag = "table"
    elif '|-' in node.group(1):
        tag = "tr"
    # tag stays None for table cells; unknown HTML tags are left untouched
    if tag not in htmltags + (None, ):
        return node.group()

    # HACKS
    attr = re.sub(r'border="2" cellpadding="4" cellspacing="0" style="margin: *1em 1em 1em 0; background: *#f9f9f9; border: *1px #aaa+ solid; *border-collapse: *collapse(; *font-size: *[89]\d%)?', r'class="wikitable" style="', attr)
    # un-subst: {{prettytable}} and it dirvatives
    attr = re.sub(r'(?i)([^<>\n]*)border\W+2\W+cellpadding\W+4\W+cellspacing\W+0"?', r' class="wikitable" \1', attr)
    # p = re.compile(r'(class="wikitable[^<>\n]+ style="[^<>"\n]*?)(margin\W+1em\W+|1em\W+1em\W+0\W+|background\W+f9f9f9\W+|border\W+1px\W+#aa+a\W+solid\W+|border-collapse\W+collapse\W+|font-size\W+(100%|95%|1em)\W+)+(?=[^<>"\n]*")', re.I)
    # while p.search(text):
    #     text = p.sub(r'\1', text)

    # WHERE DID I GET THIS!?!: ([^][{}<>|="\'\s]*[0-9a-zA-Z%._]+[^][{}<>|="\'\s]*)
    def quoteAttrib(m):
        # r' \g<attribute>="\g<value>"'
        return ' %s="%s"'%(m.group('attribute').lower(), m.group('value').strip())
    # Quote attributes
    attr = re.sub(r"""(?uix)[ ]* \b(?P<attribute>\w{2,}) [ ]*=[ ]* ["']?(?P<value> (?<=")[^"]*?(?=")| (?<=')[^']*?(?=')| [^<=>"' [\]{|}]+(?=[<> ]|$) )["']?""", quoteAttrib, attr)

    # Deprecated classes
    attr = attr.replace(' class="prettytable', ' class="wikitable')
    # Repair broken HTML
    attr = re.sub(r'(?i) bgcolor="([A-Fa-f0-9]{6})"', r' bgcolor="#\1"', attr) # add hash to colors
    attr = re.sub(r'(?i) colspan="1"', r'', attr)
    attr = re.sub(r'(?i) rowspan="1"', r'', attr)

    # # move class= to the front
    # attr = re.sub(r'^(\s*)( [^][{|}<>]+)?( class="[^"]+"(?=\s|\Z))', r'\1\3\2', attr)
    if tag == 'table':
        # TODO move me
        # Tables
        attr = re.sub(r'(?i) align="(left|right)"', r' style="float:\1;" ', attr)
        attr = re.sub(r'(?i) align="center"', r' style="margin:auto;" ', attr)
        attr = re.sub(r'(?i) align="(\w+)"', '', attr)
    elif tag == 'div':
        attr = re.sub(r'(?i) align="(left|right)"', r' style="float:\1;"', attr)
        attr = re.sub(r'(?i) align="center"', r' class="center"', attr)

    if tag == 'table':
        attr = re.sub(r'(col|row)span=("1"|1)(?=\D)', r'', attr)
        #attr = attr.replace('cellspacing="0"', 'style="border-collapse:collapse; "')
        if 'border=' not in attr:
            # See [[MediaWiki talk:Common.css# Wikitable borders without CSS]]
            attr = re.sub(r'class="wikitable([^"\'{|}]*)"( *border="?1"?)*', r'class="wikitable\1" border="1"', attr)
        if re.search('float: *right', attr) and 'toccolours' in attr and node.start() < 400:
            # floats right, and near the top, gotta be a infobox
            attr = re.sub(r'class="toc(colours|)', r'class="infobox', attr)
            attr = re.sub(r'float: *right;|margin[^:;="]*:[^:;="]+|border="1"', r'', attr)
        # border-collapse is not exactly the same but it's close enough
        #attr = re.sub(r' cellspacing="0"', r' style="border-collapse:collapse;"', attr)
        # NOTE(review): as reconstructed, border="1" added above is removed
        # again here - confirm nesting
        if 'class="wikitable' in attr:
            # Drop what class="wikitable" already provides
            attr = re.sub(r'(?i)(border:)( 1px| #aaa+| solid)+',r'\1', attr)
            attr = re.sub(r'(?i) border="?([0-9])"?', r'', attr)
            attr = re.sub(r'(?i) cellspacing="?([0])"?', r'', attr)
            attr = re.sub(r'(?i) cellpadding="?([2-4])"?', r'', attr)
            attr = re.sub(r'(?i)margin: ?1em 1em 1em 0', r'', attr)
            attr = re.sub(r'(?i)background: ?#f9f9f9', r'', attr)
            attr = re.sub(r'(?i)border-collapse: ?collapse', r'', attr)
            attr = re.sub(r'font-size: ?(100%|1em)', r'', attr)
            #if # avoid float: position: etc..
            #attr = re.sub(r'font-size: ?\.?9\d(%|em)', r'', attr)

    # replace with CSS
    attr = re.sub(r'(?i) align="(left|center|right|justify)"', r' style="text-align:\1;"', attr)
    attr = re.sub(r'(?i) bgcolor="([^"]+?)"', r' style="background-color:\1;"', attr)
    #attr = re.sub(r'(?i) border="?([1-9])"?', r' style="border:\1px;"', attr)
    attr = re.sub(r'(?i) color="([^"]+?)"',r' style="color:\1;"', attr)
    attr = re.sub(r'(?i) clear="(left|right)"', r' style="clear:\1;"', attr)
    attr = re.sub(r'(?i) clear=" *all *"', r' style="clear:both;"', attr)
    attr = re.sub(r'(?i) face="([^"]+?)"', r' style="font-family:\1;"', attr)
    attr = re.sub(r'(?i) height="([^"]+?)"', r' style="height:\1;"', attr)
    attr = re.sub(r'(?i) nowrap(="(nowrap|yes|true)"|(?= )|$)', r' style="white-space:nowrap;"', attr)
    attr = re.sub(r'(?i) size="(\d+(em|%|px|pt))"', r' style="font-size:\1;"', attr)
    attr = re.sub(r'(?i) valign="([^"]+?)"', r' style="vertical-align:\1;"', attr)
    attr = re.sub(r'(?i) width="([^"]+?)"', r' style="width:\1;"', attr)

    # font size="#" render browser dependent, W3C leaves it open
    fontSizeConvert = {'1':'0.8em','2':'1em','3':'1.2em','4':'1.4em','5':'1.9em','6':'2.4em','7':'3.7em', '-4':'50%','-3':'60%','-2':'70%','-1':'80%','0':'100%', '+1':'120%','+2':'140%','+3':'160%','+4':'180%','+5':'200%','+6':'250%','+7':'300%',}
    for n in re.finditer(r' size="([1-7]|[+-][0-6])"', attr):
        attr = attr.replace(n.group(), r' style="font-size:%s;"'%fontSizeConvert[n.group(1)])

    # merge style attributes together
    stylemerge = re.compile(r' style="([^"{|}\n]+?);* *"([^][!<>{|}\n]*?) style="([^"{|}\n]+)"')
    # Each pass folds one pair of style="" attributes into a single one
    while stylemerge.search(attr):
        attr = stylemerge.sub(r'\2 style="\1; \3"', attr)

    # Fix up style parameters
    for styleMatch in re.finditer(r' style="([^[\]{|}\n]*?)"', attr):
        styleText = fixCSS(styleMatch.group(1))
        # An empty result removes the style attribute altogether
        attr = attr.replace(styleMatch.group(), styleText and ' style="%s"'%styleText or '')
        if '=' in styleText:
            wikipedia.output("\03{lightyellow}WARNING\03{default} : U+003D EQUALS SIGN (=) character found in style attribute")

    # Remove all non approved attributes
    for m in re.finditer(r'(?<= )(\w+)(="[^"]+"| +(?=\w)| *$| *>)', attr):
        if m.group(1).lower() not in htmlattrs and tag:# HACK remove when proper table support is in
            wikipedia.output("\03{lightred}REMOVED\03{default} : Invalid attribute %s" % (m.group(),))
            attr = attr.replace(m.group(), '')
        elif m.group(2) == '=""':
            wikipedia.output("Emptry attribute")
        else:
            attr = attr.replace(m.group(), m.group(1).lower() + m.group(2))
        # Alert user about deprecated html attributes
        # FIXME this should be split up into General, Table, Font
        # TODO add border=
        if m.group(1).lower() in "align|alink|background|bgcolor|border|cellspacing|cellpadding|clear|compact|color|face|height|hspace|link|noshade|nowrap|size|start|text|type|value|valign|vlink|width|vspace".split('|'):
            wikipedia.output("\03{lightred}DEPRECATED\03{default} : %s attribute (in %s) "%(m.group(), tag or "Table"))
            wikipedia.output(node.group().strip())

    # put back in
    # Ignore changes that only touch spaces, semicolons, quotes or case, and
    # refuse rewrites that at least double the attribute text
    if re.sub(r'[ ;"]', '', node.group('attr').lower()) != re.sub(r'[ ;"]', '', attr.lower()) and len(attr) < len(node.group('attr')) * 2:
        return ''.join((node.group(1).lower(), attr.rstrip() ))
    else:
        return node.group()
def fixCSS(styleText):
    """
    Normalise the contents of an HTML style="" attribute.

    styleText -- the raw declaration list, e.g. 'color: red; width:100'
    Returns the cleaned declaration list ('prop:value; prop:value;'),
    or an empty string when nothing is left.

    Steps, in order: canonical spacing around ':' and ';', removal of empty
    declarations, '#' added to bare hex colours, assumed 'px' units,
    unit dropped from zero lengths, colour name/hex shortening and collapsing
    of repeated margin/padding values.
    """
    #TODO
    # add filter for value and dictionary units

    # Stylistics changes
    styleText += ';' # add then remove
    styleText = re.sub(r' *: *', ':', styleText)
    styleText = re.sub(r' *(; *)+', '; ', styleText)
    # Remove "float; ..." and "float:;"
    styleText = re.sub(r'(\A *|;)([^;:=]*:? *;)+', r'\1', styleText)
    styleText = re.sub(r'[\w\-\s]:; ', '', styleText)

    styleText = re.sub(r'(background|color):([a-fA-F0-9]{3,6})', r'\1:#\2', styleText)
    if styleText.count('background') == 1:
        styleText = styleText.replace('background-color:', 'background:')
    # Background:none is shorter than background-color:transparent, but resets image related properties
    # We also assume that people will not set anything else since background-image: is filtered out
    # See: [[User:Chris Chittleborough/CSS-notes]]
    styleText = re.sub(r'background:[^:;]*transparent[^:;]*;', r'background:none;', styleText)

    # Assumed units
    styleText = re.sub(r'(width|height):(\d{2,});', r'\1:\2px;', styleText)
    styleText = re.sub(r'((?:background|border|color)(?:-color)?):([a-fA-F0-9]{3,6})(?=[ ;])', r'\1:#\2', styleText)

    # Fix units
    # The delimiter is only looked at, not consumed, so the replacement must
    # not add its own ';' (that produced "width:5px;;" and split shorthand
    # values such as "margin:5 10px" into "margin:5px; 10px;").
    styleText = re.sub(r'\b(width|height|border|margin|padding):(\d{2,}|[1-9])(?=[; ])', r'\1:\2px', styleText)
    styleText = re.sub(r'(?<=[ :]0)(em|%|px|pt)(?=[ ;])', "", styleText)

    # IE color compatiblity
    styleText = re.sub(r'(?i)\bgrey\b', r'gray', styleText)
    styleText = re.sub(r'(?i)(dark|dim|light|lightslate|slate)gr[ae]y', r'\1grey', styleText)

    # Shorten CSS color values
    for m in re.finditer(r'#(?:[0-9a-fA-F]{6}|[0-9a-fA-F]{3})(?=[ ;!])', styleText):
        color = m.group()
        # #aabbcc -> #abc; back-references cover every doubled digit
        # (the former explicit list of pairs had lost "88")
        doubled = re.match(r'#([0-9a-f])\1([0-9a-f])\2([0-9a-f])\3$', color.lower())
        if doubled:
            styleText = styleText.replace(color, '#' + ''.join(doubled.groups()))
        elif color.upper() in namedColors:
            styleText = styleText.replace(color, namedColors[color.upper()])
        else:
            styleText = styleText.replace(color, color.lower())

    # use mirroring
    # "a a a a" -> "a", "a b a b" -> "a b", "a b c b" -> "a b c"
    styleText = re.sub(r'(margin|padding):(?P<v>-?[\.0-9]+[a-zA-Z]+|0)( (?P=v))+;', r'\1:\2;', styleText)
    styleText = re.sub(r'(margin|padding):(-?[\.0-9]+[a-zA-Z]+|0) (-?[\.0-9]+[a-zA-Z]+|0) \2 \3;', r'\1:\2 \3;', styleText)
    styleText = re.sub(r'(margin|padding):(-?[\.0-9]+[a-zA-Z]+|0) (-?[\.0-9]+[a-zA-Z]+|0) (-?[\.0-9]+[a-zA-Z]+|0) \3;', r'\1:\2 \3 \4;', styleText)

    return styleText.strip()
def ext2intLinks(page, text):
    """
    Convert external links that point at Wikimedia/interwiki sites into
    internal (interwiki) link syntax.

    page -- page being processed; page.site().language() and
            page.namespace() are read
    text -- wikitext to process
    Returns the modified wikitext.

    NOTE(review): block nesting was reconstructed from source whose
    indentation was lost; the wikia rule and the interwiki map loop are
    assumed to run for every namespace - confirm.
    """
    def sisterLink(m):
        # Only one of the project-letter groups 2-7 takes part in a match.
        # A r'\2\3\4\5\6\7' template raises "unmatched group" for the others
        # on Python < 3.5, so the link is built by hand.
        project = ''.join([letter for letter in m.group(2, 3, 4, 5, 6, 7) if letter])
        return '[[%s:%s:%s|%s]]' % (project, m.group(1), m.group(8), m.group(9))

    text = re.sub(r'\[http://upload.wikimedia.org/wikipedia/(?:commons|%s)/[0-9A-Fa-f]/[0-9A-Fa-f]{2}/([^[\]<>\s?]+) *((?<= )[^\n\]]+)\]' % (page.site().language()), r'[[Media:\1|\2]]', text)
    text = re.sub(r'\[http://upload.wikimedia.org/wikipedia/(?:commons|%s)/[0-9A-Fa-f]/[0-9A-Fa-f]{2}/([^[\]<>\s?]+)\]' % (page.site().language()), r'<ref>[[Media:\1]]</ref>', text)

    text = re.sub(r'\[http://(www\.toolserver\.org|toolserver\.org|tools\.wikimedia\.org|tools\.wikimedia\.de)/([^][<>"\s;?]*)\?? ([^]\n]+)\]', r'[[tools:\2|\3]]', text)
    if page.namespace() == 0:
        # [[WP:SELF]] states that we shouldn't cross link from the main namespace
        text = re.sub(r'''(?ix)\[http://([a-z]{3}(?:-[a-z]+)*)\.(?:
            (wikt)ionary|
            wiki(n)ews|
            wiki(b)ooks|
            wiki(q)uote|
            wiki(s)ource|
            wiki(v)ersity)\.(?:com|net|org)/wiki/
            (?![_ :]*(?:Talk|Help|User|Wikipedia|Wikinews|Wikibooks|wikiquote|wikisource|wikiversity|Portal|MediaWiki)(?:[ _]talk)?:)
            ([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', sisterLink, text)
        text = re.sub(r'''(?ix)\[http://(meta|commons|incubator|quality)
            \.wikimedia\.(?:com|net|org)/wiki/
            (?![_:]*(?:Talk|Help|User|Meta|commons|incubator|quality|Portal|MediaWiki)(?:_talk)*:)
            ([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[\1:\2|\3]]', text)
    else:
        text = re.sub(r'''(?ix)\[http://([a-z]{3}(?:-[a-z]+)*)\.(?:
            (wikt)ionary|
            wiki(n)ews|
            wiki(b)ooks|
            wiki(q)uote|
            wiki(s)ource|
            wiki(v)ersity)\.(?:com|net|org)/wiki/
            ([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', sisterLink, text)
        text = re.sub(r'''(?ix)\[http://(meta|commons|incubator|quality)
            \.wikimedia\.(?:com|net|org)/wiki/
            ([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[\1:\2|\3]]', text)
    text = re.sub(r'''(?ix)\[http://([a-z0-9\-]+)\.wikia\.(?:com|net|org)/wiki/
        ([^][{|}\s"]*)[| ]+([^\n\]]+)\]''', r'[[wikia:\1:\2|\3]]', text)

    # Reverse interwiki map
    # [0-9A-Za-z\-.:_] not escaped
    # [;:@$!*(),/] are converted back in GlobalFunctions.php
    # [_#\'\\^`~] are assumed to be safe
    #conflict = {}
    for (prefix, url) in interwiki_map.iteritems():
        # Expensive overlap test
        #if url in conflict:
        #    print("Collision in interwiki map [[%s:]] and [[%s:]] on %s<br/>" % (prefix, conflict[url], url))
        #else:
        #    conflict[url] = prefix
        #for a,b in interwiki_map.iteritems():
        #    if b.find(url) == 0 and a != prefix:
        #        print("Overlap between interwiki map [[%s:]] (%s) and [[%s:]] (%s)<br/>" % (prefix, url, a, b))
        if '$1' not in url:
            # Without the $1 placeholder there is no title group, so the
            # \2 reference in the replacement below would be invalid
            continue
        text = re.sub(r'\[%s +([^\n\[\]]+)\]'%re.escape(url).replace('\\$1', r'([0-9A-Za-z\-.;;:@$!*(),/_#\'\\^`~]*)'), r"[[%s:\1|\2]]"%prefix, text)
    return text
def canonicalTitle(title, ucfirst=True): # TODO rename keyword
    """
    Converts unicode or bytes string to mw titles
    support: percent-encoded UTF-8, HTML character references

    title   -- unicode or byte string; byte strings are decoded as ASCII,
               then UTF-8, then latin-1
    ucfirst -- upper-case the first character when true
    Returns the unicode title without its '#section' part.
    """
    try:
        title = unicode(title)
    except UnicodeError:
        try:
            title = unicode(title, 'utf-8')
        except UnicodeError:
            # latin-1 maps every byte, so this cannot fail for byte strings
            title = unicode(title, 'latin-1')

    # HTML character references
    title = wikipedia.html2unicode(title)
    # Unpercent-encode
    title = unicode(urllib.unquote(title.encode('utf-8')), 'utf-8')
    # Underscore to space and Strip space
    title = title.replace('_', ' ').strip()
    # Merge multiple spaces
    # (a "while ' ' in title" loop never ends once a title contains a space)
    title = re.sub(r' {2,}', ' ', title)
    # First uppercase
    if ucfirst and title:
        title = title[0].upper() + title[1:]
    # Strip the section part
    if '#' in title:
        title = title[:title.index('#')]
    return title
def simplifyLinks(page, text):
    # Prettify piped wiki links: underscores become spaces, percent and
    # MediaWiki ".XX" anchor escapes are decoded, and [[A|AB]] is reduced
    # to [[A]]B.
    #   page -- not referenced in this function
    #   text -- wikitext to process
    # Returns the modified wikitext.
    # NOTE(review): block nesting was reconstructed from source whose
    # indentation was lost - confirm.
    def dot2percent(m):
        # ".C3.A9" -> "%C3%A9" so that urllib.unquote() can decode it
        return m.group().replace('.', '%')
    # Prettify links, remove underscore and decode characters
    for m in re.finditer(ur'\[\[([^[\]{|}\n]+)\|([^\n|]*?)\]\]', text):
        link = m.group(1).replace('_', ' ').encode('utf-8')
        if '#' in link:
            title, anchor = link.split('#', 1)
            # Protect literal percent signs before ".XX" is turned into "%XX"
            anchor = anchor.replace('%', '.25')
            # (?x): every '#' comment below must stay on a line of its own,
            # otherwise it swallows the rest of the pattern
            anchor = re.sub(r'''(?x)
                # Single byte character (Printable ASCII)
                # we make that [0-9A-Za-z\-.:_] and [[\]{|}] are not included
                 \.2[1-9A-CF]
                |\.3[BD-F]
                # We need to avoid encoding <tag> and </tag>
                |\.3C(?!\w|/\w|\.2F\w)
                |\.40
                |\.5[CE]
                |\.60
                |\.7E
                # skip .8-B\h
                # Two byte UTF-8 character U+0080-U+07FF
                |\.[CD][0-9A-F]\.[89AB][0-9A-F]
                # Three byte UTF-8 character U+0800-U+FFFF
                |\.E[0-9A-F]\.[89AB][0-9A-F]\.[89AB][0-9A-F]
                # Four byte UTF-8 character U+10000-U+10FFFF
                |\.F[0-7]\.[89AB][0-9A-F]\.[89AB][0-9A-F]\.[89AB][0-9A-F]
                ''', dot2percent, anchor)
            link = ''.join((title, '#', anchor))
        link = urllib.unquote(link) # unescape %xx
        # Specific formating
        if link.startswith('tools:'):link = link.replace(' ', '_')
        link = link.replace('# ', '#') # get ride of copy/paste space
        link = unicode(link, 'utf-8')
        #if m.group(2)[0:1].islower():
        #if m.group(1) != link
        # Lower-case the first letter only when neither the target (after its
        # first character) nor the label contains an upper-case letter
        if not any((s.isupper() for s in link[1:])) and not any((s.isupper() for s in m.group(2))):
            if re.search(r'(?i)\[\[(\w{3,})\w{0,3}[()_ |[\]].*?\b\1', m.group()):
                # Come up with better huristics
                link = link[0].lower() + link[1:]
        text = text.replace(m.group(), '[[%s|%s]]'%(link, m.group(2)))

    # Simplify links
    # FIXME use canonicalTitle
    # [[A|AB]] -> [[A]]B
    text = re.sub(ur'(?u)\[\[([^{|}[\]]+)\|\1(\w*)\]\]', ur'[[\1]]\2', text)
    ## A[[ABC|B]]C -> [[ABC]]
    #text = re.sub(ur'(?ui)([^{|}[\]]* *) *\[\[ *\1([^{|}[\]]+ *)( *[^{|}[\]]*) *\| *\2\]\]\3', ur'[[\1\2\3]]', text)
    # TODO
    # unbypass redirect change [[Light_cycle#Light_cycles]] and [[Tron_(film)#Light_cycles]] to the redirect [[Light cycle]]
    # find redirect such that A [[Article |B]] C to [[A B C]]
    return text
def fixReferences(page, text):
    # Clean up <ref> footnotes: tag case and name= spelling, stray
    # punctuation, bare URLs, bracketed links turned into references,
    # punctuation/space placement around references, merging of duplicate
    # references under generated names, and <references/> wrappers replaced
    # by {{reflist}}.
    #   page -- not referenced in this function
    #   text -- wikitext to process
    # Returns the modified wikitext.
    # NOTE(review): block nesting was reconstructed from source whose
    # indentation was lost, and runs of spaces inside string literals may
    # have been collapsed - confirm.

    # Standardize to lowercase reference name, makes things easier for everyone
    text = re.sub(r'(?uis)<(/?)REF\b([^>]*)>', r'<\1ref\2>', text)
    # it should be name = " or name=" NOT name ="
    text = re.sub(r'<ref +name(= *| *=)"', r'<ref name="', text)

    # Remove puncutation between start/end of ref/templates (}}.</ref>)
    text =re.sub(r'(<ref[^>]*>\s*)[,.?:;~!]+\s*(?=\{\{)', r'\1', text)
    text = re.sub(r'(\{\{[^{}]{40,}\}\})\s*[,.?:;~!]+(?=\s*</ref>)', r'\1', text)

    # Leaving out the http://
    text = re.sub(r'(?<=<ref>)\s*([a-z0-9\-\.]*?[a-z0-9\-]+\.[a-z\.]{2,6}/[^][<>\s"]+)\s*(?=</ref>)', r'http://\1', text)
    text = re.sub(r'(?<=<ref>)\s*\[?(?:http://)?([a-z0-9\-\.]*?[a-z0-9\-]+\.[a-z\.]{2,6}/[^][<>\s"|]+) +([^][{|}<>\n/]+?)\]?\s*(?=</ref>)', r'[http://\1 \2]', text)

    # TODO: Fix the below [ref] to <ref>[url]</ref> conversion
    text = re.sub(r'(?is)<ref\s*>\s*(\[\w+://[^][<>"\s]+\s*\])\s*(\[\w+://[^][<>"\s]+\s*\])\s*</ref\s*>', r'<ref>\1</ref><ref>\2</ref>', text)

    ## Badly formed references
    # Fake reference (<sup>[url]</sup>)
    text = re.sub(r'(?i)<sup *>\s*\[(\w+://[^][<>"\s]+) *\]\s*</sup>', r'<ref>\1</ref>', text)
    # Bracket to reference conversion
    # BUG matches <!-- [http://link/index ] -->
    # The pattern consumes the start of the line, so one pass converts at
    # most one link per line; repeat the substitution 8 times
    for i in range(8):
        #text = re.sub(r'(?miu)(^[^*#;:= ]{1,4}.{4,}?)(?<![*#]{3})(?<!PDFlink\|)(?<!PDF\|)(?<![(])\[((?:http|https|ftp)://[0-9a-z\-\.:]+/[^[\]<>\s"]{8,})\s*\](?![^-]*-->)(?!([^<]|<(?!ref))*</ref>)', r'\1<ref>\2</ref>', text)
        # testing
        text = re.sub(r'(?miu)(^[^#;:= ]{1,4}.{4,}?)(?<=[^*#]{15})(?<!PDFlink\|)(?<!PDF\|)(?<![(])\[((?:http|https|ftp)://[0-9a-z\-\.:]+/[^[\]<>\s"]{8,})\s*\](?![^-]*-->)(?!([^<]|<(?!ref))*</ref>)', r'\1<ref>\2</ref>', text)

    # remove invalid references
    text = re.sub(r'(?i)<ref> *</ref>', '', text)

    ## Format Punctuation
    # Applied if "[,.;:]<ref/>" is dominate
    if len(re.findall(r'[.,;:] *\s?<ref', text)) > len(re.findall(r'(?:</ref>|<ref [^</>]+/>) *\s?[.,;:]', text)):
        # Move punctuation left and space right but before \n
        text = re.sub(r'(?s)(?<=[\w")\]])( *)((?: *\s??<ref [^>]+?/>| *\s??<ref[^>]*?>(?:[^<]|<(?!/?ref>))*?</ref>)+)( *)\n?([.,]|(?<!\n)[;:])(?![.,;:])(\s??)( *)', r'\4\2\1\6\5\3', text)
        # Move space to the right, if there's text to the right
        text = re.sub(r'(?s)(?<=[.,;:"])( +)((?: *\s??<ref [^>]+?/>| *\s??<ref[^>]*?>(?:[^<]|<(?!/?ref>))*?</ref>)+)(?= *\s?[^\s<>])', r'\2\1', text)
        # Remove duplicate punctuation
        text = re.sub(r'(?s)(?P<punc>[.,;:])(["]?(?:<ref [^>]+?/> *\s?|<ref[^>]*?>([^<]|<(?!/?ref>))*?</ref> *\s?)+)(?P=punc)(?![.,]|(?<!\n)[:;])', r'\1\2', text)
        # Remove spaces between references
        text = re.sub(r'(</ref>|<ref [^>]+?/>) +(<ref)', r'\1\2', text)
        # Add two space if none, reduce to two if more
        # trim or add whitespace after <ref />
        text = re.sub(r'(</ref>|<ref [^>]+?/>)()((\'{2,5}|)[\w"(\[])', r'\1 \3', text)
        text = re.sub(r'(</ref>|<ref [^>]+?/>)( {3,})([\w(\[])', r'\1 \3', text)
    elif len(re.findall(r'(?:</ref>|<ref [^</>]+/>) *\s?[.,;:]', text)) > 10:
        wikipedia.output('\03{lightyellow}ALERT\03{default}: Punctuation after the references is the dominate format!')

    # Merge duplicate refs
    # TODO seperate reference group from naming
    for m in re.finditer(r'(?si)(<ref>)(.*?)(</ref>)', text):
        # Skip single references
        if text.count(m.group()) <= 1:
            continue
        # Get a meaningful word part
        # (first pattern whose group 1 is longer than 4 characters and is not
        # a month or day name wins)
        for p in (r'\|\s*last\s*=(\w+)', # Reference template: | last = LASTNAME
                r'[Bb][Yy] +[A-Z][a-z]+ +([A-Z][a-z]+)[.,\'"]',
                r'^((?:Mc|)[A-Z][a-z])[,.]', # First word, must be capitalized and followed by punctuation
                r'(?s)\w+://[a-z0-9\-\.]*?([a-z0-9\-]+)\.[a-z\.]{2,6}[ /|=!]', # Website DOMAIN
                r'(?s)^(?:\[\[[^][]+\|)?((?<![{])(?<=\W)\b\w+)[,. ].*?(\d{2,4}\b)', # [[Author|McAuthor]] p. 25
                r'(?si)\{\{.*?\|(\w*?)\|.*\}\}', # EXPERIMENTAL: {{Harvnb|McCann|1999|p=247}}
                ):
            # accessdate= and "Retrieved ..." text is blanked before matching
            match = re.search(p, re.sub(r'accessdate\s*=[^{|}]*|Retrieved [\s\w\[\],]+', ' ', m.group(2)), re.UNICODE)
            if match and len(match.group(1)) > 4 and match.group(1).lower() not in ignoreAsNames:
                refname = match.group(1)
                break
        else:
            refname = 'autogenerated' # Default name
            # try for the longest Capitalized word
            # NOTE(review): '\[\[^][|]+' looks like it was meant to be
            # '\[\[[^][|]+' - confirm
            for n in re.findall(r'\b(?:Mc)?[A-Z][a-z]+\b', re.sub(r'\|[^{|}=]+=|\{\{[^{|}]+\||\[\[^][|]+\|', ' ', m.group(2) )):
                if len(n) > len(refname):
                    refname = n
        # Remove non-letters to avoid names like "rescue007"
        refname = refname.strip('\t\r\n 0123456789-').lower()

        # Get a number
        # (page, year or roman numeral that makes the name unused in text)
        for p in (r'\|\s*(?:pages|page|p|pp)\s*=\s*(\d+)',
                r'\b(?:pages|page|p|pp|pg)[.:= ]*(\d{1,4})\b[\w\s\.\-<&\]]*',
                r'\|\s*year\s*=\s*(\d{4})',
                r'\b(19\d\d|200[0-7])\b',
                r'\b([mclxvi]*[clxvi]{2,6})(?:\b|\.)'
                ):
            match = re.search(p, re.sub(r'accessdate\s*=[^{|}]*|Retrieved [\s\w\[\],]+', ' ', m.group(2)) )
            if match and refname+match.group(1) not in text:
                refname = refname+match.group(1)
                break
        else:
            # No usable number found: append the first free counter value
            i = 1
            while refname+str(i) in text:
                i+=1
            else:
                refname += str(i)
        # the replacement name should be 50% smaller
        if len(m.group(2)) * 0.50 > len(refname) + 8:
            # First occurrence keeps the content, the others become <ref name=.../>
            text = text.replace(m.group(), '<ref name="%s">%s</ref>' % (refname, m.group(2)), 1)
            text = text.replace(m.group(), '<ref name="%s"/>' % refname)

    # remove formatting wrappers (adapted from AWB)
    m = re.search(r'(?i)(<(span|div)( class="(references-small|small|references-2column)"|)>\s*){1,2}\s*<references[\s]?/>(\s*</(span|div)>){1,2}', text)
    # Only when opening and closing <div> counts inside the match agree
    if m and m.group().count('<div') == m.group().count('</div'):
        cols = re.search(r'((?!-)column-count|-moz-column-count):\s*?(\d+)', m.group())
        if "references-2column" in m.group():
            text = text.replace(m.group(), '{{reflist|2}}')
        elif cols:
            text = text.replace(m.group(), '{{reflist|%s}}' % cols.group(2))
        else:
            text = text.replace(m.group(), '{{reflist}}')

    # Multicolumn {{Reflist}}
    # If more than 30 refs, make sure the reference section is multi column
    if text.count('</ref>') > 30:
        text = re.sub(r'(?is)(=\s+(<!--.*?-->)*\s*)(\{\{Cleanup-link rot[^{}]*\}\}\s*)?(<references />|\{\{(?:Listaref|Reference|Refs|Reflist|Refs)\|?[134]?\}\})', r'\1{{reflist|colwidth=30em}}', text)
    elif text.count('</ref>') < 8:
        text = re.sub(r'(?is)(=\s+)\{\{reflist\|(\d+|colwidth=\d+\w+)\}\}', r'\1{{reflist}}', text)
    else:
        pass
    return text
def correctdate(s):
    """Placeholder: not implemented, always returns None."""
    pass

def wiki_table(match):
    """Placeholder re.sub() callback: returns the matched text unchanged."""
    return match.group()

def html_attrib(match):
    """Placeholder re.sub() callback: returns the matched text unchanged."""
    return match.group()

##
# Token number -> hidden text; filled by hideText(), emptied by showText()
hideTokens = {}
# Regions whose content must not be touched by the fixers
hideRegex = re.compile('|'.join([
    r'<!--.*?-->',
    r'<includeonly>.*?</includeonly>',
    r'<math>.*?</math>',
    r'<nowiki>.*?</nowiki>',
    r'<source .*?</source>',
    r'<pre.*?</pre>',
    r'<timeline>.*?</timeline>',
    r'<gallery.*?>.*?</gallery>',
]), re.I | re.S)

def hideText(text):
    """
    Replace every region matched by hideRegex with a numbered placeholder.

    The hidden text is stored in the module level hideTokens dictionary and
    is put back by showText().  Returns the text with placeholders.
    """
    global hideTokens
    # Continue after the highest token still pending so that a second call
    # before showText() cannot overwrite earlier tokens (first token is 112)
    n = max([111] + list(hideTokens))
    for m in hideRegex.finditer(text):
        n+=1
        hideTokens[n] = m.group()
        text = text.replace(m.group(), u'⌊⌊⌊⌊%06d⌋⌋⌋⌋'%n)
    return text

def showText(text):
    """
    Put back the text hidden by hideText() and empty hideTokens.

    Raises RuntimeError when a placeholder is left over afterwards.
    """
    global hideTokens
    for (key, value) in hideTokens.items():
        text = text.replace(u'⌊⌊⌊⌊%06d⌋⌋⌋⌋'%key, value)
    # Same pattern as before, spelled without the Python 2 only ur'' prefix
    if re.search(u'(?u)⌊⌊⌊⌊\\d{6,}⌋⌋⌋⌋', text):
        wikipedia.output("WARNING: Unable to replace all hidden tokens")
        # A plain string is not a valid exception object
        raise RuntimeError("Please report this problem at [[User talk:Dispenser]]")
    hideTokens = {} # Empty
    return text
import parser print ''' <table style="table-layout:fixed; width:100%%;"> <tr style="vertical-align:top;"> <td>%s</td> <td>%s</td> </tr> </table>''' % (parser.parser(test).encode('utf-8'), parser.parser(fix(text=test, page=page)).encode('utf-8')) return else: genFactory.handleArg(arg) if not gen: gen = genFactory.getCombinedGenerator() if not gen: wikipedia.showHelp('commonfixes') return for page in gen: try: page.get() except wikipedia.NoPage: wikipedia.output('%s does not exist!' % page.aslink()) continue except wikipedia.IsRedirectPage: wikipedia.output(u'Page %s is a redirect' % page.aslink()) continue text = fix(page=page) if text != page.get(): wikipedia.showDiff(page.get(), text) wikipedia.setAction(summary) page.put(text) else: print 'No changes necessary' if __name__ == "__main__" and wikipedia.handleUrlAndHeader(): try: wikipedia.startContent(form=True) main() finally: wikipedia.endContent() wikipedia.stopme()