URL list

<b>#       URL lists</b>
<b>#</b>
blacklist   = re.compile('|'.join([
	r'\d+\.\d+\.\d+\.104/.*cache:', # Google Cache (IP version)
	r'google[^/]*/.*cache:',	# Google Cache, listed because the links unexpectedly disappear and are often used to repair dead links
	r'cnomy.com',		   # SCO
]), re.IGNORECASE)
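# Example (sketch only, not part of the original tool; the helper name is
# hypothetical): a URL matching the blacklist would simply be skipped.
def is_blacklisted(url):
	return blacklist.search(url) is not None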
<b># Sites that need their content checked; wastes a bit more bandwidth</b>
deep_analyze_sites = re.compile('|'.join([
	r'oregonlive.com/',
	r'statesmanjournal.com/',
	r'salemmonthly.com/',
	r'courier-journal.com',
	r'telegraph.co.uk', # Doesn't like HEAD requests with the Firefox agent
]), re.IGNORECASE)
<b># Pattern to notify of expiring news publication links</b>
expiringNewsSites = re.compile('|'.join([
	r'://[^/]*(news.yahoo.com/s/ap|sports.yahoo.com/ncaab)',
	r'://[^/]*chicagotribune.com/[ne]{2}',
	r'://[^/]*suntimes.com/',
	r'://[^/]*reuters\..{3}/', # e.g. reuters.com
	r'[\./@]ap.org/',
	# list from [[User:Peteforsyth/O-vanish]]
	r'[\./@]oregonlive.com/',
	r'[\./@]statesmanjournal.com/',
	r'[\./@]salemmonthly.com/',
	r'[\./@]miamiherald.com/',
]))
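# Example (assumption; the helper name is hypothetical): links matching
# expiringNewsSites would trigger a notice so editors can archive them
# before the publisher takes them down.
def should_warn_expiring(url):
	return expiringNewsSites.search(url) is not None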
stndAgent = re.compile('|'.join([
	r'telegraph.co.uk', # Doesn't like HEAD requests with the Firefox agent
]), re.IGNORECASE)
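# Example (assumption; the agent strings are placeholders, not the ones the
# tool actually sends): a sketch of how deep_analyze_sites and stndAgent
# could steer the request strategy together.
def request_strategy(url):
	method = 'GET' if deep_analyze_sites.search(url) else 'HEAD'
	agent = 'Python-urllib' if stndAgent.search(url) else 'Mozilla/5.0 Firefox/2.0'
	return method, agent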
<b># URLs to ignore</b>
ignorelist = [
	re.compile(r'.*[\./@]example.(com|net|org)(/.*)?'), # reserved for documentation
	re.compile(r'.*[\./@]archive.org(/.*)?'),
	re.compile(r'.*[\./@]wikipedia.org/wiki/.*'), # People like to link to images
	re.compile(r'.*[\./@]video.google.com/.*'), # Issues with the textScore rule, mostly links
	re.compile(r'.*[\./@]tools.wikimedia.(org|de)/.*'), # So we don't end up calling ourselves
]
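# Example (assumption): the ignorelist entries are anchored full-URL
# patterns (note the leading '.*'), so re.match() against the whole URL
# is the natural test here.
def is_ignored(url):
	return any(p.match(url) for p in ignorelist)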
<b># Academic journals requiring subscriptions</b>
<b># BTW: Could somebody tell me WHY they're using 404s here!?!</b>
jornals = re.compile('|'.join([
	'nature.com',
	'princetonreview.com',
	'oxforddnb.com',
	'e-publishing.library.cornell.edu',
	'muse.jhu.edu',
	'journals.cambridge.org',
	'jstor.org',
	'dx.doi.org',
	'iop.org',
]), re.IGNORECASE)
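# Example (assumption; the helper name is hypothetical): since these sites
# answer 404 even for valid subscription content, a 404 from them should
# not be reported as a dead link.
def is_subscription_journal(url):
	return jornals.search(url) is not None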
Interaction