\r\n", newTable)
if n>0:
warning_messages.append(u'NOTE: Found | without | . This shouldn\'t cause problems.\n')
# with attributes, with missing |
newTable, n = re.subn("(?i)[\r\n]* [^>]*?)>(?P[\w\W]*?)[\r\n]+",
r"\r\n|\g | \g\r\n", newTable)
if n > 0:
warning_messages.append(u'NOTE: Found | without | . This shouldn\'t cause problems.\n')
##################
# Garbage collecting ;-)
newTable = re.sub("(?i)[\r\n]*<\/tr>", "", newTable)
# delete closing tags
newTable = re.sub("(?i)[\r\n]*<\/t[rdh]>", "", newTable)
##################
# OK, that's only theory but works most times.
# Most browsers assume that | gets a new row and we do the same
# newTable, n = re.subn("([\r\n]+\|\ [^\r\n]*?)([\r\n]+\!)",
# "\\1\r\n|-----\\2", newTable)
# warnings = warnings + n
# adds a |---- below for the case the new | is missing
# newTable, n = re.subn("([\r\n]+\!\ [^\r\n]*?[\r\n]+)(\|\ )",
# "\\1|-----\r\n\\2", newTable)
# warnings = warnings + n
##################
# most | come with '''title'''. Senseless in my eyes cuz
# | should be bold anyways.
newTable = re.sub("[\r\n]+\!([^'\n\r]*)'''([^'\r\n]*)'''",
r"\r\n!\1\2", newTable)
##################
# Removes indentation within tables. Be warned: in rare cases this may
# produce bad results.
# True by default. Set 'deIndentTables = False' in user-config.py
if config.deIndentTables:
num = 1
while num != 0:
newTable, num = re.subn("(\{\|[\w\W]*?)\n[ \t]+([\w\W]*?\|\})",
r"\1\r\n\2", newTable)
##################
# kills additional spaces after | or ! or {|
# This line was creating problems, so I commented it out --Daniel
# newTable = re.sub("[\r\n]+\|[\t ]+?[\r\n]+", "\r\n| ", newTable)
# kills trailing spaces and tabs
newTable = re.sub("\r\n(.*)[\t\ ]+[\r\n]+",
r"\r\n\1\r\n", newTable)
# kill extra new-lines
newTable = re.sub("[\r\n]{4,}(\!|\|)",
r"\r\n\1", newTable);
##################
# shortening if had no arguments/parameters
newTable = re.sub("[\r\n]+\{\|[\ ]+\| ", "\r\n\{| ", newTable)
# shortening if | had no articles
newTable = re.sub("[\r\n]+\|[\ ]+\| ", "\r\n| ", newTable)
# shortening if | had no articles
newTable = re.sub("\n\|\+[\ ]+\|", "\n|+ ", newTable)
# shortening of had no articles
newTable = re.sub("[\r\n]+\![\ ]+\| ", "\r\n! ", newTable)
##################
# proper attributes. attribute values need to be in quotation marks.
num = 1
while num != 0:
# group 1 starts with newlines, followed by a table or row tag
# ( {| or |--- ), then zero or more attribute key - value
# pairs where the value already has correct quotation marks, and
# finally the key of the attribute we want to fix here.
# group 2 is the value of the attribute we want to fix here.
# We recognize it by searching for a string of non-whitespace characters
# - [^\s]+? - which is not embraced by quotation marks - [^"]
newTable, num = re.subn(r'([\r\n]+(?:\|-|\{\|)[^\r\n\|]+) *= *([^"\s>]+)',
r'\1="\2"', newTable, 1)
num = 1
while num != 0:
# The same for header and cell tags ( ! or | ), but for these tags the
# attribute part is finished by a | character. We don't want to change
# cell contents which accidentally contain an equal sign.
# Groups 1 and 2 are analogous to those of the previous regular expression;
# group 3 holds the remaining attribute key - value pairs.
newTable, num = re.subn(r'([\r\n]+(?:!|\|)[^\r\n\|]+) *= *([^"\s>]+)([^\|\r\n]*)\|',
r'\1="\2"\3|', newTable, 1)
##################
# merge two short s
num = 1
while num != 0:
newTable, num = re.subn("[\r\n]+(\|[^\|\-\}]{1}[^\n\r]{0,35})" +
"[\r\n]+(\|[^\|\-\}]{1}[^\r\n]{0,35})[\r\n]+",
r"\r\n\1 |\2\r\n", newTable)
####
# add a new line if first is * or #
newTable = re.sub("[\r\n]+\| ([*#]{1})",
r"\r\n|\r\n\1", newTable)
##################
# strip from
newTable = re.sub("([\r\n]+\![^\r\n]+?)([\w\W]+?)<\/center>",
r"\1 \2", newTable)
# strip align="center" from because the .css does it
# if there are no other attributes than align, we don't need that | either
newTable = re.sub("([\r\n]+\! +)align\=\"center\" +\|",
r"\1", newTable)
# if there are other attributes, simply strip the align="center"
newTable = re.sub("([\r\n]+\![^\r\n\|]+?)align\=\"center\"([^\n\r\|]+?\|)",
r"\1 \2", newTable)
##################
# kill additional spaces within arguments
num = 1
while num != 0:
newTable, num = re.subn("[\r\n]+(\||\!)([^|\r\n]*?)[ \t]{2,}([^\r\n]+?)",
r"\r\n\1\2 \3", newTable)
##################
# I hate those long lines because they make a wall of letters
# Off by default, set 'splitLongParagraphs = True' in user-config.py
if config.splitLongParagraphs:
num = 1
while num != 0:
# TODO: how does this work? docu please.
# why are only äöüß used, but not other special characters?
newTable, num = re.subn("(\r\n[A-Z]{1}[^\n\r]{200,}?[a-zäöüß]\.)\ ([A-ZÄÖÜ]{1}[^\n\r]{200,})",
r"\1\r\n\2", newTable)
return newTable, warnings, warning_messages
def markActiveTables(self, text):
    """
    Mark all table start and end tags that are not disabled by nowiki
    tags, comments etc.

    Every active opening tag ``<table`` is rewritten to ``<##table##`` and
    every active closing tag ``</table>`` to ``</##table##>``. Later steps
    only work on these marked tags.

    @param text: the page text to scan
    @return: the text with all active table tags marked
    """
    # Regions in which table tags must be left untouched; the names are
    # passed straight through to wikipedia.replaceExcept().
    skippedRegions = ('comment', 'math', 'nowiki', 'pre', 'source')

    tableStartTagR = re.compile("<table", re.IGNORECASE)
    tableEndTagR = re.compile("</table>", re.IGNORECASE)

    # A fresh list is handed over on each call, exactly as a list literal
    # would be, in case replaceExcept() modifies its argument.
    text = wikipedia.replaceExcept(text, tableStartTagR, "<##table##",
                                   exceptions=list(skippedRegions))
    # The marker keeps the leading "</" so that replacing '##table##' with
    # 'table' restores the original closing tag.
    text = wikipedia.replaceExcept(text, tableEndTagR, "</##table##>",
                                   exceptions=list(skippedRegions))
    return text
def findTable(self, text):
    """
    Find the first marked HTML table (which can contain nested tables)
    inside a text.

    @param text: text whose table tags were marked by markActiveTables()
    @return: a tuple (table, start, end) where table is the substring
        text[start:end] spanning the outermost table, including its
        markers. Returns (None, 0, 0) if the text contains no marked
        start tag, or if a start tag is never closed.
    """
    # Note that the ## characters were added in markActiveTables().
    markedTableStartTagR = re.compile("<##table##", re.IGNORECASE)
    # Deliberately without a leading "</" so that it matches the end
    # marker whether or not that prefix is present.
    markedTableEndTagR = re.compile("##table##>", re.IGNORECASE)

    firstStart = markedTableStartTagR.search(text)
    if not firstStart:
        return None, 0, 0

    start = firstStart.start()
    # Scan position inside the unmodified text. Searching from a position
    # avoids copying the remaining text after every tag.
    pos = firstStart.end()
    # depth level of table nesting
    depth = 1
    while depth > 0:
        nextStarting = markedTableStartTagR.search(text, pos)
        nextEnding = markedTableEndTagR.search(text, pos)
        if not nextEnding:
            print("More opening than closing table tags. Skipping.")
            return None, 0, 0
        if nextStarting and nextStarting.start() < nextEnding.start():
            # another table tag is opened before one is closed
            pos = nextStarting.end()
            depth += 1
        else:
            pos = nextEnding.end()
            depth -= 1
    return text[start:pos], start, pos
def convertAllHTMLTables(self, text):
    '''
    Convert every HTML table in text to wiki syntax.

    The text is first passed through markActiveTables(); then tables are
    located one at a time with findTable(), converted with convertTable()
    and spliced back into the text until findTable() reports none left.

    Returns a tuple of the converted text, the number of converted tables
    and the summed number of warnings.
    '''
    text = self.markActiveTables(text)
    convertedTables = 0
    warningSum = 0
    collectedMessages = []
    while True:
        table, start, end = self.findTable(text)
        if not table:
            # no more HTML tables left
            break
        tableNumber = convertedTables + 1
        print(">> Table %i <<" % tableNumber)
        # convert the current table
        newTable, warningsThisTable, warnMsgsThisTable = self.convertTable(table)
        # show the changes for this table
        if self.debug:
            print(table)
            print(newTable)
        elif not self.quietMode:
            wikipedia.showDiff(table.replace('##table##', 'table'), newTable)
            # NOTE(review): indentation is lost in SOURCE; this blank line
            # is assumed to belong to the diff branch -- confirm.
            print("")
        warningSum += warningsThisTable
        for msg in warnMsgsThisTable:
            collectedMessages.append('In table %i: %s' % (tableNumber, msg))
        # replace the HTML table by its converted form
        text = text[:start] + newTable + text[end:]
        convertedTables = tableNumber
    warningMessages = u''.join(collectedMessages)
    wikipedia.output(warningMessages)
    return text, convertedTables, warningSum
def treat(self, page):
    '''
    Load a page, convert all HTML tables in its text to wiki syntax,
    and save the converted text.

    Returns True if the converted text was handed to page.put_async(),
    otherwise returns False. False is returned when the page is missing
    or a redirect, when marked table tags are left over after conversion,
    when no table was converted, or when the user declines.
    NOTE(review): put_async() presumably saves in the background, so True
    means the save was requested, not confirmed -- verify.
    '''
    wikipedia.output(u'\n>>> %s <<<' % page.title())
    site = page.site()
    try:
        text = page.get()
    except wikipedia.NoPage:
        wikipedia.output(u"ERROR: couldn't find %s" % page.title())
        return False
    except wikipedia.IsRedirectPage:
        wikipedia.output(u'Skipping redirect %s' % page.title())
        return False

    newText, convertedTables, warningSum = self.convertAllHTMLTables(text)

    # Check if there are any marked tags left
    markedTableTagR = re.compile("<##table##|##table##>", re.IGNORECASE)
    if markedTableTagR.search(newText):
        wikipedia.output(u'ERROR: not all marked table start or end tags processed!')
        return False

    if convertedTables == 0:
        wikipedia.output(u"No changes were necessary.")
        return False

    # Decide whether to save without asking the user.
    if config.table2wikiAskOnlyWarnings and warningSum == 0:
        doUpload = True
    elif config.table2wikiSkipWarnings:
        doUpload = True
    else:
        print("There were %i replacement(s) that might lead to bad output." % warningSum)
        doUpload = (wikipedia.input(u'Do you want to change the page anyway? [y|N]') == "y")
    if not doUpload:
        return False

    # get edit summary message
    if warningSum == 0:
        wikipedia.setAction(wikipedia.translate(site.lang, msg_no_warnings))
    elif warningSum == 1:
        wikipedia.setAction(wikipedia.translate(site.lang, msg_one_warning) % warningSum)
    else:
        wikipedia.setAction(wikipedia.translate(site.lang, msg_multiple_warnings) % warningSum)
    page.put_async(newText)
    return True
def run(self):
    """Call treat() on each page yielded by self.generator, in order."""
    for currentPage in self.generator:
        self.treat(currentPage)
def main():
quietMode = False # use -quiet to get less output
# if the -file argument is used, page titles are stored in this array.
# otherwise it will only contain one page.
articles = []
# if -file is not used, this temporary array is used to read the page title.
page_title = []
debug = False
# Which namespaces should be processed?
# default to [] which means all namespaces will be processed
namespaces = []
xmlfilename = None
gen = None
# This factory is responsible for processing command line arguments
# that are also used by other scripts and that determine on which pages
# to work on.
genFactory = pagegenerators.GeneratorFactory()
for arg in wikipedia.handleArgs():
if arg.startswith('-xml'):
if len(arg) == 4:
xmlfilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
else:
xmlfilename = arg[5:]
gen = TableXmlDumpPageGenerator(xmlfilename)
elif arg == '-sql':
query = u"""
SELECT page_namespace, page_title
FROM page JOIN text ON (page_id = old_id)
WHERE old_text LIKE '% | | | | | | | | |