#!/usr/bin/python # -*- coding: utf-8 -*- import cgi, urlparse, httplib import re import cgitb; cgitb.enable(logdir='./tracebacks/') import sys noarchive = ( 'archive.org', 'toolserver.org', 'wikipedia.org', 'archive.org', 'www.google.com', 'youtube.com', 'webcitation.org', ) # Messages redirectmsg = """
![]() |
Redirects or moves are normal operations of a server. A well designed site will redirect users from older URLs to the newest URL of the month; as intentional with permalinks. At times, however, they redirect to advertisement, login, soft 404, and error pages. Read more. |
![]() |
WebCite offers on demand archiving, allowing preservation of cited material the same as the day it was archived. This service will not work for sites who have explicitly opted out using the robots.txt, no-cache / no-archive, or have blocked the WebCite robot. Read more... Please view content before archiving to ensure there were no false negatives in Checklinks |
![]() |
An archive request was sent to WebCite. $1 |
', e.reason, ''
return None
except UnicodeEncodeError:
return None
if f.info().get('Content-Encoding') in ('gzip', 'x-gzip'):
import StringIO, gzip
text = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read()
else:
text = f.read()
# Update {{dead link}} to w/ history
print """"""
print 'Internet Archive\'s Wayback Machine has some results (* indicate changes)'
iBegin = text.index('Articles also using this link:
In %s space:
Checklinks first sucessfully accessed this url on %s
"%filename[:-4] break def linktrack(location, useHEAD = True, counter = 7, redirectChain = []): try: print '' print 'HEAD %s' % location while (counter >= 0 and location is not None): address = urlparse.urlsplit(location) if address.scheme == "http": conn = httplib.HTTPConnection(address.hostname) elif address.scheme == "https": conn = httplib.HTTPSConnection(address.hostname) else: return (None, 'Unsupported Protocol', redirectChain) conn.set_debuglevel(0) httplib.socket.setdefaulttimeout(20) path = address.path or '/' query = address.query and '?' + address.query or '' # FIXME http://www.kurnik.pl/slownik/sp.phtml?sl=Gar%B3uch try: # if type(path) == type(query) == type(''): request = path.encode('ascii') + query.encode('ascii') except UnicodeEncodeError: encoding = 'utf-8' noencode = '~!^*()_-=&/|,.?;' import urllib request = urllib.quote(path.encode(encoding) + query.encode(encoding), noencode) except UnicodeDecodeError: import urllib reqeust = urllib.quote(path+query, '~!^*()_-=&/|,.?;') print request conn.request(useHEAD and 'HEAD' or 'GET', request) response = conn.getresponse() redirect = response.msg.getheader('location') # It more failsafe if we use a try statement otherwise we could simply test if useHEAD was set if not useHEAD: text = response.read() else: text = '' conn.close() counter -= 1 if redirect: # print '' # print 'HTTP %s Move: %s' % (response.status, redirect) print '''HTTP/%g %d Move: %s''' % (response.version/10.0, response.status, redirect, response.msg, ) if(redirect.startswith("http")): location = redirect else: location = urlparse.urljoin(location, redirect) redirectChain.append(location) else: location = None print '''HTTP/%g %d %s''' % (response.version/10.0, response.status, response.reason, response.msg, ) print '' return (response, text, redirectChain) except httplib.socket.error, arg: print 'Socket error: ', arg print '' return (None, "SOCKET %r" % arg, redirectChain) except Exception, e: # catches those weird ones print u'Exception raised: %s' % e print '' raise return (None, "Exception %s" % e, redirectChain) def main(): form = cgi.FieldStorage() url = form.getfirst('url', '')#.replace(' ', '+') useHEAD = (form.getfirst('method', "HEAD")=="HEAD") if not url: print '' return (response, text, redirectChain) = linktrack(url, useHEAD) if response is None: print text else: if len(response.msg.getheader('Content-Length', '')) > 6: print "File size is %4.3g MB" % (float(response.msg.getheader('Content-Length', 'error'))/1024.0/1024.0,) # FIXME mOVE INTO THE RIGHT SPOT dbname = form.getfirst('dbname', 'enwiki') # diff view of redirect if redirectChain != []: import difflib d = difflib.Differ() p = re.compile(r'(\W)') redirect = redirectChain.pop() cmpr = d.compare(p.split(url), p.split(redirect) ) htmlurl_1 ='' htmlurl_2 ='' for s in cmpr: if s[0] == '-': htmlurl_1 += '%s' % s[2:] elif s[0] == '+': htmlurl_2 += '%s' % s[2:] elif s[0] == ' ': htmlurl_1 += s[2:] htmlurl_2 += s[2:] elif s[0] == '?': htmlurl_1 += '' else: print list(cmpr) raise print ('
\n%s\n redirects to\n%s' % (htmlurl_1, htmlurl_2)).replace('', '') print redirectmsg sys.stdout.flush() #TODO add parameter to activate wayback search if response and (int(response.status/100) == 4 or response.status==0) or form.getfirst('archivesearch', '') in ('dead-4', 'dead-5', 'yes', 'true'): # Link is dead # Results from Internet Archive Wayback Machine print iac = InternetArchiveConsulter(url) results = iac.getArchiveURL() print results print """ """ # Results from WebCite import webcite wcu = webcite.WebCite(url) results = wcu.search() for result in results: if result['status'] == 'success': print '
', result, ''
# if url.encode('utf-8')+'\n' in open('/home/dispenser/webcite_requests.txt','r'):
# print "WebCite archive may exist -- Search
" % url if (redirectChain == [] and not vars().has_key('redirect') and response and response.status == 200 and form.getfirst('archivesearch','') in ('dead-0', '0', ) and not any(( (domain in url) for domain in noarchive)) and response.msg.getheader('Cache-Control', '').lower().find('no-store') == -1 ): import webcite wcu = webcite.WebCite(url) results = wcu.search() if url.encode('utf-8')+'\n' in open('/home/dispenser/webcite_requests.txt','r'): s = '' for result in results: if result['status'] == 'success': s += '' % (result['webcite_url'], result['timestamp'], ) else: print '', result, ''
print webcitearchivedmsg.replace('$1', "WebCite has archive this on: %s"%results
f = open('/home/dispenser/webcite_requests.txt', 'a')
f.write(url.encode('utf-8')+'\n')
f.close()
print 'Adding WebCite status to internal list. Click again to see new status
' print webcitearchivedmsg.replace('$1', "") # elif response.msg.getheader('Content-Type', '') == 'application/pdf': # if results == []: # f = open('/home/dispenser/webcite_requests.txt', 'a') # f.write(url.encode('utf-8')+'\n') # f.close() # # ema = "".join((chr(ord(s)^7) for s in 'cntwbitbuGshhktbuqbu)hu`')) # s = wcu.archive(ema).read() # print "Requesting archiving of Portable Document Format file