')
ref.setPublisher(domain.search(ref.link).group(2), hints=t)
print '
'
print '- Classes
'
print '- '
print ''
for c in mdClass:
list = getClassTextNodes(c, u.unicode, re.I|re.U)
if list and list != []:
wikipedia.output("\"%s\" yields: %r" % (c, [cleanText(re.sub(r'<(/?\w+)[^<>]*>', r'<\1>', s[:200])) for s in list[:10] ]))
print ''
for c in bylClass+dateClass+pubClass:
list = getClassTextNodes(c, u.unicode, re.U)
if list and list != []:
wikipedia.output("\"%s\" yields: %r" % (c, [cleanText(re.sub(r'<(/?\w+)[^<>]*>', r'<\1>', s[:200])) for s in list[:10] ]))
for c in bylClass:
list = getClassTextNodes(c, u.unicode, re.I|re.U)
if list and ref.setAuthor(list):
wikipedia.output('INFORMATION: Setting author as %r from class %r'% (ref.date, c))
break
print '
'
wikipedia.output("DATE:")
for c in dateClass + bylClass:
if ref.setDate( getClassTextNodes(c, u.unicode, re.U) ):
wikipedia.output('Setting date as %r from class %r'% (ref.date, c))
break
else:
#TODO get text from ...
first
if ref.setDate( ref.url ):
wikipedia.output('DATE set from the URL')
# Try to get any date out of the page; this can cause it to pick up comment dates or irrelevant dates in the text
elif ref.setDate( (cleanText(u.unicode),) ):
wikipedia.output('DATE extracted from text, likely is not correct')
elif ref.setDate((u.unicode,)):
wikipedia.output('DATE extracted from raw HTML, very likely not correct')
else:
wikipedia.output('ERROR: DATE extracted failed. Please examine the HTML to find hidden structures for dates. Dates newer than 1 week old are ignored')
for c in pubClass:
list = getClassTextNodes(c, u.unicode, re.I|re.U)
if list and len(list)==1:
ref.setPublisher(list[0])
break
if '
...
LastUpdated
"""
print '- <meta> tags
'
print '- '
for m in re.finditer(r'(?is)\w+)[^<>]+content\s*=\s*(?P
["\']?)(?P[^<"\'>]+)(?P=quote)>', u.unicode):
# eliminate search engine hints - they are of no use to us
# 'PUBDATE', # USAToday.com
# 'author',
if not m.group('name').lower() in ('keywords', 'description', 'robots', 'robot', 'bots'):
wikipedia.output("%s: %r" % (m.group('name'), m.group('content')))
if m.group('name').lower() in (
'pubdate', # nytimes
'doc_date', # cbsnews.com
'LastUpdated', #calmac.co.uk
# seems to be last modified stamp
#'pd', # chess.about.com
):
ref.setDate( [m.group('content'), ] )
elif m.group('name').lower() in (
'author', # globalsecurity.org, # cbsnews.com usually set as "CBSNews"
'alt_author', # cbsnews.com (always Joel Roberts?)
'byl', # nytimes.com
):
ref.byline = m.group('content')
elif m.group('name').lower() in (
'citation_doi', # nytimes.com
):
ref.doi = m.group('content')
elif m.group('name').lower() in (
'geo', # nytimes.com
):
ref.location = m.group('content')
elif m.group('name').lower() in (
'pg', # nytimes.com
):
ref.page = m.group('content')
else:
pass
print '
'
print '- Nodes
'
print '- '
# smh.com.au uses nodes and
for c in ('title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'byline', 'date', 'arttitle'):
contents = re.findall(r'(?uis)<%s[^>]*>\s*(.*?)\s*%s\s*>' % (c, c), u.unicode)
if contents:
wikipedia.output("<%s>: %r"%(c, [re.sub(r'\s+', r' ', re.sub(r'?\w+[^<>]*>', '', s)) for s in contents[:5] ]))
if c in ('h1', 'h2') and len(contents) == 1:
contents[0] = re.sub(r'?\w+[^<>]*>', '', contents[0]).strip()
if not ref.title:
ref.title=contents[0]
elif contents[0] in ref.title and not hasattr(ref, 'publisher'):
ref.title=contents[0]
ref.setPublisher(ref.title.replace(contents[0], '').strip('-\n. :'))
print '
'
# Note the third part really can be anything
finddoi = re.findall(r'(?u)(?:doi:|=\s*"|dx.doi.org/|hdl.handle.net/)(?P10\.[0-9]{4}/[\w\-\.]{10,})(?=[<>"\s]|[^\w\.\-])', u.unicode)
if not ref.doi and len(finddoi)==1:
print 'adding doi'
ref.doi = finddoi[0]#.group('doi')
contents = re.search(r'\b[Pp]ermalink\b', u.unicode)
if contents:
print '- Analysis
- Page has a Permalink
'
lastMod = headers.getheader('Last-Modified')
if lastMod:
print '- Headers
- Last Modified header: %r
'%lastMod
print '
'
# End console output
print '
'
# Error conditions
if not ref.title:
wikipedia.output(u'%s : No title found...' % ref.link)
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
#TODO move to ref class
if self.titleBlackList.match(ref.title):
wikipedia.output(u'\03{lightred}WARNING\03{default} : Blacklisted title (%s)' % (ref.title))
repl = ref.refLink()
new_text = new_text.replace(match.group(), repl)
continue
ref.transform()
repl = ref.refTitle()
new_text = new_text.replace(match.group(), repl)
if not refsupdate:
try: # KEEP MAINTENANCE UP TO DATE
import maintainer
maintainer.removeBEL(page)
except ImportError:
pass
if new_text == page.get():
wikipedia.output('No changes were necessary in %s'
% page.aslink())
continue
if new_text == commonfixes.fix(page=page, text=page.get()):
wikipedia.output('Only common fixes for %s'
% page.aslink())
continue
if self.norefbot.lacksReferences(new_text, verbose=False):
new_text = self.norefbot.addReferences(new_text)
# [[User talk:Dispenser/Reflinks#2 requests]]
if useTemplates:
new_text=new_text.replace('