#!/usr/bin/python # -*- coding: utf-8 -*- import cgi, urlparse, httplib import cgitb; cgitb.enable(logdir='./logs/') # # # # r'[^<>]*' # webcitation.org class InternetArchiveConsulter: def __init__(self, url): self.url = url def getArchiveURL(self): import urllib2 #print u'Consulting the Internet Archive for %s' % self.url archiveURL = 'http://web.archive.org/web/20010101-*/%s' % self.url try: f = urllib2.urlopen(archiveURL) except urllib2.HTTPError, e: # The Internet Archive yields a 403 error when the site was not # archived due to robots.txt restrictions. if e.code == 403: return 'Internet Archive results are blocked' elif e.code == 404: return """No matches from the Internet Archive (begins 2002)""" elif e.code == 501: return """Failed connection -- Internet Archive, more""" else: return "Something when wrong, got error code %s" % e.code except urllib2.URLError, e: print 'We failed to reach a server.' print 'Reason: ', e.reason return None except UnicodeEncodeError: return None if f.info().get('Content-Encoding') in ('gzip', 'x-gzip'): import StringIO, gzip text = gzip.GzipFile(fileobj=StringIO.StringIO(f.read())).read() else: text = f.read() # Update {{dead link}} to w/ history print """""" print 'Internet Archive\'s Wayback Machine has some results (* indicate changes)' iBegin = text.index('
Appearances in %r namespace:
' print 'HEAD %s' % url statusTuple = checkLink(url, useHEAD) print '''HTTP/%g %d %s''' % (statusTuple[0]/10.0, statusTuple[1], statusTuple[2], statusTuple[3]) if statusTuple[5] != []: import re, difflib d = difflib.Differ() p = re.compile(r'(\W)') redirect = statusTuple[5].pop() cmpr = d.compare(p.split(url), p.split(redirect) ) htmlurl_1 ='' htmlurl_2 ='' for s in cmpr: if s[0] == '-': htmlurl_1 += '%s' % s[2:] elif s[0] == '+': htmlurl_2 += '%s' % s[2:] elif s[0] == ' ': htmlurl_1 += s[2:] htmlurl_2 += s[2:] elif s[0] == '?': htmlurl_1 += '' else: print list(cmpr) raise print '----\n%s\n redirects to\n%s' % (htmlurl_1.replace('', ''), htmlurl_2.replace('', '')) #DO NOT BLINDLY CLICK, please make sure that the HTTP redirect is actually correct before clicking!
print """
![]() |
Redirects or moves are normal operations of a server. A well-designed site will redirect users from older URLs to the newest URL of the month, as is intended with permalinks. At times, however, they redirect to advertisement, login, soft 404, and error pages. Read more. |
WebCite archive may exist -- Search
" % url else: print "" # for m in re.finditer(r'http://web.archive.org/web/(\d{15})/[^<">]+', results): # # pass if statusTuple[5] == [] and not vars().has_key('redirect') and statusTuple[1]==200 and form.getfirst('archivesearch','') in ('dead-0', ) and url.encode('utf-8')+'\n' not in open('/home/dispenser/webcite_requests.txt','r') and not ( 'toolserver.org' in url or 'wikipedia.org' in url or 'archive.org' in url or 'www.google.com' in url or 'youtube.com' in url or 'webcitation.org' in url ) and statusTuple[3].getheader('Cache-Control', '') != '': if statusTuple[3].getheader('Content-Type', '') == 'application/pdf': import webcite wcu = webcite.WebCite(url) results = wcu.search() if results == []: f = open('/home/dispenser/webcite_requests.txt', 'a') f.write(url.encode('utf-8')+'\n') f.close() ema = "".join((chr(ord(s)^7) for s in 'cntwbitbuGshhktbuqbu)hu`')) s = wcu.archive(ema).read() print "Requesting archiving of Portable Document Format file![]() |
WebCite offers on-demand archiving, allowing preservation of cited material exactly as it was on the day it was archived. This service will not work for sites that have explicitly opted out using robots.txt or no-cache / no-archive directives, or that have blocked the WebCite robot. Read more... Please view the content before archiving to ensure there were no false negatives in Checklinks. |