#!/usr/bin/python # -*- coding: utf-8 -*- import cgi, urlparse, httplib import cgitb; cgitb.enable(logdir='./logs/') # # # # r'[^<>]*' # webcitation.org class InternetArchiveConsulter: def __init__(self, url): self.url = url def getArchiveURL(self): import urllib2 #print u'Consulting the Internet Archive for %s' % self.url archiveURL = 'http://web.archive.org/web/20010101-*/%s' % self.url try: f = urllib2.urlopen(archiveURL) except urllib2.HTTPError, e: # The Internet Archive yields a 403 error when the site was not # archived due to robots.txt restrictions. if e.code == 403: return 'Internet Archive results are blocked' elif e.code == 404: return """No matches from the Internet Archive (begins 2002)""" elif e.code == 501: return """Failed connection -- Internet Archive, more""" else: return "Something when wrong, got error code %s" % e.code except urllib2.URLError, e: print 'We failed to reach a server.' print 'Reason: ', e.reason return None except UnicodeEncodeError: return None if f.info().get('Content-Encoding') in ('gzip', 'x-gzip'): import StringIO, gzip text = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() else: text = f.read() # Update {{dead link}} to w/ history print """""" print 'Internet Archive\'s Wayback Machine has some results (* indicate changes)' iBegin = text.index('') iEnd = text.index('
', iBegin) + len('') return text[iBegin:iEnd].replace('Use  ]+', results): yield m.group(1), m.group(0) def findothers(url): namespace_names = { 0: '(article)', 1: u'Talk', 2: u'User', 3: u'User_talk', 4: u'Wikipedia', 5: u'Wikipedia_talk', 6: u'File', 7: u'File_talk', 8: u'MediaWiki', 9: u'MediaWiki_talk', 10: u'Template', 11: u'Template_talk', 12: u'Help', 13: u'Help_talk', 14: u'Category', 15: u'Category_talk', } try: import MySQLdb wiki = 'enwiki' db = MySQLdb.connect(db=wiki+'_p', host=wiki.replace('_', '-') + '-p.db.toolserver.org', read_default_file="/home/dispenser/.my.cnf") c = db.cursor() c.execute(""" SELECT page_namespace, page_title FROM page JOIN externallinks ON page_id = el_from WHERE el_to LIKE %s ORDER BY page_namespace """, (url+"%", )) results = list(c.fetchall()) if len(results) <= 2: return print '
' prev = None for (ns, title) in results: if ns == prev: print '' else: prev = ns if ns != None: print '' print '

Apperances in %r namespace:

' print '
External links search (for double checking)'% ('en.wikipedia.org', url) print '
' except: pass def checkLink(location, useHEAD = True, counter = 7, redirectChain = []): try: while (counter >= 0 and location is not None): (scheme, site, path, query, frag) = urlparse.urlsplit(location) if query != '': query = '?' + query if path == '': path = '/' if scheme == "http": conn = httplib.HTTPConnection(site) elif scheme == "https": conn = httplib.HTTPSConnection(site) else: return (True, -1, 'Unsupported Protocol', None, '', redirectChain) conn.set_debuglevel(0) #socket.setdefaulttimeout(30) try: request = path.encode('ascii') + query.encode('ascii') except UnicodeEncodeError: encoding = 'utf-8' noencode = '~!^*()_-=&/|,.?;' request = unicode(urllib.quote(path.encode(encoding) + query.encode(encoding), noencode)) if useHEAD: conn.request('HEAD', request) else: conn.request('GET', request) response = conn.getresponse() redirect = response.msg.getheader('location') # It more failsafe if we use a try statement otherwise we could simply test if useHEAD was set if not useHEAD: text = response.read() else: text = '' conn.close() counter -= 1 if redirect: print unicode( u'HTTP %s Move: %s' % (response.status, redirect) ) if(redirect.startswith("http")): location = redirect else: location = urlparse.urljoin(location, redirect) redirectChain.append(location) else: location = None return (response.version, response.status, response.reason, response.msg, text, redirectChain) except Exception, e: # catches those weird ones print u'Exception raised: %s' % e return (0, 0, "Exception %s" % e, None, '', redirectChain) def main(): form = cgi.FieldStorage() # print '
' url = form.getfirst('url', '').replace(' ', '+') useHEAD = (form.getfirst('method', "HEAD")=="HEAD") print '
'
	print 'HEAD %s' % url
	statusTuple = checkLink(url, useHEAD)
	print '''HTTP/%g %d %s''' % (statusTuple[0]/10.0, statusTuple[1], statusTuple[2], statusTuple[3])
	
	if statusTuple[5] != []:
		import re, difflib
		d = difflib.Differ()
		p = re.compile(r'(\W)')
		redirect = statusTuple[5].pop()
		cmpr = d.compare(p.split(url), p.split(redirect) )
		
		htmlurl_1 =''
		htmlurl_2 =''
		for s in cmpr:
			if s[0] == '-':
				htmlurl_1 += '%s' % s[2:]
			elif s[0] == '+':
				htmlurl_2 += '%s' % s[2:]
			elif s[0] == ' ':
				htmlurl_1 += s[2:]
				htmlurl_2 += s[2:]
			elif s[0] == '?':
				htmlurl_1 += ''
			else:
				print list(cmpr)
				raise
		print '----\n%s\n redirects to\n%s' % (htmlurl_1.replace('', ''), htmlurl_2.replace('', ''))
#

DO NOT BLINDLY CLICK, please make sure that the HTTP redirect is actually correct before clicking!

print """
Redirects or moves are normal operations of a server. A well designed site will redirect users from older URLs to the newest URL of the month; as intentional with permalinks. At times, however, they redirect to advertisement, login, soft 404, and error pages. Read more.
""" % redirect else: print '' #TODO add parameter to activate wayback search if (statusTuple[1] and int(statusTuple[1]/100) == 4) or statusTuple[1]==0 or form.getfirst('archivesearch', '') in ('dead-4', 'dead-5', 'yes', 'true'): print iac = InternetArchiveConsulter(url) results = iac.getArchiveURL() print results # TODO actually search webcite! if url.encode('utf-8')+'\n' in open('/home/dispenser/webcite_requests.txt','r'): print "

WebCite archive may exist -- Search

" % url else: print "" # for m in re.finditer(r'http://web.archive.org/web/(\d{15})/[^<">]+', results): # # pass if statusTuple[5] == [] and not vars().has_key('redirect') and statusTuple[1]==200 and form.getfirst('archivesearch','') in ('dead-0', ) and url.encode('utf-8')+'\n' not in open('/home/dispenser/webcite_requests.txt','r') and not ( 'toolserver.org' in url or 'wikipedia.org' in url or 'archive.org' in url or 'www.google.com' in url or 'youtube.com' in url or 'webcitation.org' in url ) and statusTuple[3].getheader('Cache-Control', '') != '': if statusTuple[3].getheader('Content-Type', '') == 'application/pdf': import webcite wcu = webcite.WebCite(url) results = wcu.search() if results == []: f = open('/home/dispenser/webcite_requests.txt', 'a') f.write(url.encode('utf-8')+'\n') f.close() ema = "".join((chr(ord(s)^7) for s in 'cntwbitbuGshhktbuqbu)hu`')) s = wcu.archive(ema).read() print "Requesting archiving of Portable Document Format file
" print results elif form.getfirst('archivenow'): import webcite print 'Requesting archiving of url - WebCite' webcite.requestArchiving(url) f = open('/home/dispenser/webcite_requests.txt', 'a') f.write(url.encode('utf-8')+'\n') f.close() else: print """

WebCite offers on demand archiving, allowing preservation of cited material the same as the day it was archived. This service will not work for sites who have explicitly opted out using the robots.txt, no-cache / no-archive, or have blocked the WebCite robot. Read more...

Please view content before archiving to ensure there were no false negatives in Checklinks

""" findothers(url) if __name__ == "__main__": try: print 'Content-type: text/html; charset=UTF-8' print print '' print '' print """ """ main() finally: print ''