#!/usr/bin/python # -*- coding: utf-8 -*- import cgi, urlparse, httplib import re import cgitb; cgitb.enable(logdir='./tracebacks/') import sys noarchive = ( 'archive.org', 'toolserver.org', 'wikipedia.org', 'archive.org', 'www.google.com', 'youtube.com', 'webcitation.org', ) # Messages redirectmsg = """
Redirects or moves are normal operations of a server. A well designed site will redirect users from older URLs to the newest URL of the month; as intentional with permalinks. At times, however, they redirect to advertisement, login, soft 404, and error pages. Read more.
""" webcitemsg = """

WebCite offers on demand archiving, allowing preservation of cited material the same as the day it was archived. This service will not work for sites who have explicitly opted out using the robots.txt, no-cache / no-archive, or have blocked the WebCite robot. Read more...

Please view content before archiving to ensure there were no false negatives in Checklinks

""" webcitearchivedmsg = """

An archive request was sent to WebCite.

$1
""" # for m in re.finditer(r'http://web.archive.org/web/(\d{15})/[^<">]+', results): # r'[^<>]*' # webcitation.org class InternetArchiveConsulter: def __init__(self, url): self.url = url def getArchiveURL(self): import urllib2 #print u'Consulting the Internet Archive for %s' % self.url archiveURL = 'http://web.archive.org/web/20010101-*/%s' % self.url try: f = urllib2.urlopen(archiveURL) except urllib2.HTTPError, e: # The Internet Archive yields a 403 error when the site was not # archived due to robots.txt restrictions. if e.code == 403: return 'Internet Archive results are blocked' elif e.code == 404: return """No matches from the Internet Archive""" elif e.code == 501: return """Failed connection -- Internet Archive, more""" elif e.code == 503: print 'try: %s'%(archiveURL,archiveURL) return "503 error, do you know what it is?" else: return "Something when wrong, got error code %s" % e.code except urllib2.URLError, e: print 'We failed to reach a server (Internet Archive).', '
' print 'Reason: ', e.reason, '' return None except UnicodeEncodeError: return None if f.info().get('Content-Encoding') in ('gzip', 'x-gzip'): import StringIO, gzip text = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() else: text = f.read() # Update {{dead link}} to w/ history print """""" print 'Internet Archive\'s Wayback Machine has some results (* indicate changes)' iBegin = text.index('') iEnd = text.index('
', iBegin) + len('') return text[iBegin:iEnd].replace('Use  ]+', results): yield m.group(1), m.group(0) def findothers(url, wiki='enwiki'): import MySQLdb db = MySQLdb.connect(db=wiki+'_p', host=wiki.replace('_', '-') + '-p.db.toolserver.org', read_default_file="/home/dispenser/.my.cnf") c = db.cursor() c.execute(""" SELECT domain, ns_name, page_title FROM page JOIN toolserver.namespace JOIN externallinks ON page_id = el_from WHERE el_to = %s /* namespace name */ AND dbname = %s AND ns_id = page_namespace ORDER BY page_namespace """, (url, wiki+'_p',)) results = list(c.fetchall()) if len(results) <= 2: return print '
' prev = None base = 'http://en.wikipedia.org/wiki' suppresseded = False for (domain, ns, title,) in results: base = 'http://%s/wiki'%domain if ns != prev: if ns != None: print '' if ns == '': print '

Articles also using this link:

' c.close() def accessdate(url): import os listofaccessed = '/home/dispenser/accessed/' files = os.listdir(listofaccessed) files.sort() for filename in files: if '\t'+url.encode('utf-8')+'\t' in open(listofaccessed + filename, 'r').read(): print "

Checklinks first sucessfully accessed this url on %s

"%filename[:-4] break def linktrack(location, useHEAD = True, counter = 7, redirectChain = []): try: print '
'
		print 'HEAD %s' % location
		while (counter >= 0 and location is not None):
			address = urlparse.urlsplit(location)
			if address.scheme == "http":
				conn = httplib.HTTPConnection(address.hostname)
			elif address.scheme == "https":
				conn = httplib.HTTPSConnection(address.hostname)
			else:
				return (None, 'Unsupported Protocol', redirectChain)
			conn.set_debuglevel(0)
			httplib.socket.setdefaulttimeout(20)
			
			path  = address.path or '/'
			query = address.query and '?' + address.query or ''

			# FIXME http://www.kurnik.pl/slownik/sp.phtml?sl=Gar%B3uch
			try:
		#	if type(path) == type(query) == type(''):
				request = path.encode('ascii') + query.encode('ascii')
			except UnicodeEncodeError:
				encoding = 'utf-8'
				noencode = '~!^*()_-=&/|,.?;'
				import urllib
				request = urllib.quote(path.encode(encoding) + query.encode(encoding), noencode)
			except UnicodeDecodeError:
				import urllib
				reqeust = urllib.quote(path+query,  '~!^*()_-=&/|,.?;')
				print request
				
			conn.request(useHEAD and 'HEAD' or 'GET', request)
			
			response = conn.getresponse()
			redirect = response.msg.getheader('location')
			# It more failsafe if we use a try statement otherwise we could simply test if useHEAD was set
			if not useHEAD:
				text = response.read()
			else:
				text = ''
			conn.close()
			
			counter -= 1
			if redirect:
#				print ''
#				print 'HTTP %s Move: %s' % (response.status, redirect)
				print '''HTTP/%g %d Move: %s''' % (response.version/10.0, response.status, redirect, response.msg, )
				if(redirect.startswith("http")):
					location = redirect
				else:
					location = urlparse.urljoin(location, redirect)
				redirectChain.append(location)
			else:
				location = None
		print '''HTTP/%g %d %s''' % (response.version/10.0, response.status, response.reason, response.msg, )
		print '
' return (response, text, redirectChain) except httplib.socket.error, arg: print 'Socket error: ', arg print '' return (None, "SOCKET %r" % arg, redirectChain) except Exception, e: # catches those weird ones print u'Exception raised: %s' % e print '' raise return (None, "Exception %s" % e, redirectChain) def main(): form = cgi.FieldStorage() url = form.getfirst('url', '')#.replace(' ', '+') useHEAD = (form.getfirst('method', "HEAD")=="HEAD") if not url: print '
' return (response, text, redirectChain) = linktrack(url, useHEAD) if response is None: print text else: if len(response.msg.getheader('Content-Length', '')) > 6: print "File size is %4.3g MB" % (float(response.msg.getheader('Content-Length', 'error'))/1024.0/1024.0,) # FIXME mOVE INTO THE RIGHT SPOT dbname = form.getfirst('dbname', 'enwiki') # diff view of redirect if redirectChain != []: import difflib d = difflib.Differ() p = re.compile(r'(\W)') redirect = redirectChain.pop() cmpr = d.compare(p.split(url), p.split(redirect) ) htmlurl_1 ='' htmlurl_2 ='' for s in cmpr: if s[0] == '-': htmlurl_1 += '%s' % s[2:] elif s[0] == '+': htmlurl_2 += '%s' % s[2:] elif s[0] == ' ': htmlurl_1 += s[2:] htmlurl_2 += s[2:] elif s[0] == '?': htmlurl_1 += '' else: print list(cmpr) raise print ('
\n%s\n redirects to\n%s
' % (htmlurl_1, htmlurl_2)).replace('', '') print redirectmsg sys.stdout.flush() #TODO add parameter to activate wayback search if response and (int(response.status/100) == 4 or response.status==0) or form.getfirst('archivesearch', '') in ('dead-4', 'dead-5', 'yes', 'true'): # Link is dead # Results from Internet Archive Wayback Machine print iac = InternetArchiveConsulter(url) results = iac.getArchiveURL() print results print """ """ # Results from WebCite import webcite wcu = webcite.WebCite(url) results = wcu.search() for result in results: if result['status'] == 'success': print '
' % (result['webcite_url'], result['timestamp'], ) else: print '', result, '' # if url.encode('utf-8')+'\n' in open('/home/dispenser/webcite_requests.txt','r'): # print "

WebCite archive may exist -- Search

" % url if (redirectChain == [] and not vars().has_key('redirect') and response and response.status == 200 and form.getfirst('archivesearch','') in ('dead-0', '0', ) and not any(( (domain in url) for domain in noarchive)) and response.msg.getheader('Cache-Control', '').lower().find('no-store') == -1 ): import webcite wcu = webcite.WebCite(url) results = wcu.search() if url.encode('utf-8')+'\n' in open('/home/dispenser/webcite_requests.txt','r'): s = '' for result in results: if result['status'] == 'success': s += '
  • ' % (result['webcite_url'], result['timestamp'], ) else: print '', result, '' print webcitearchivedmsg.replace('$1', "WebCite has archive this on: ") elif results != []: print "%s"%results f = open('/home/dispenser/webcite_requests.txt', 'a') f.write(url.encode('utf-8')+'\n') f.close() print '

    Adding WebCite status to internal list. Click again to see new status

    ' print webcitearchivedmsg.replace('$1', "") # elif response.msg.getheader('Content-Type', '') == 'application/pdf': # if results == []: # f = open('/home/dispenser/webcite_requests.txt', 'a') # f.write(url.encode('utf-8')+'\n') # f.close() # # ema = "".join((chr(ord(s)^7) for s in 'cntwbitbuGshhktbuqbu)hu`')) # s = wcu.archive(ema).read() # print "Requesting archiving of Portable Document Format file
    " # print results elif form.getfirst('archivenow'): print 'Requesting archiving of url - WebCite' webcite.requestArchiving(url) f = open('/home/dispenser/webcite_requests.txt', 'a') f.write(url.encode('utf-8')+'\n') f.close() else: print webcitemsg sys.stdout.flush() findothers(url, dbname) accessdate(url) if __name__ == "__main__": try: print 'Content-type: text/html; charset=UTF-8' print print '' print '' print """ """ main() finally: print ''