#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Crawl a site and extract all unique URLs for html pages.

This script takes one argument: the url to the site to crawl.
If you want to store the output, pipe it to a file. Progress messages and
error tracebacks are written to stderr so that the redirected output only
contains the list of links.

Usage example (output to console):
    python crawlsite.py http://www.mysite.com

Usage example (output to file in Windows):
    python crawlsite.py http://www.mysite.com > mylinks.txt

This script was written in haste. Please report errors to
pete@standards-schmandards.com

This script uses the htmldata library by Connelly Barnes. Please make sure
it is available in the same folder.

htmldata manual
http://oregonstate.edu/~barnesc/htmldata/html/public/htmldata-module.html

Originally by Peter Krantz on 2005/04/01
Completely reworked and refactored by Philip Roche phil@philroche.net
on 19/10/2006
"""

# TODO: need to allow form actions
import urllib2
import htmldata
import httplib
import sys
import urlparse
import codecs
import datetime
import traceback


class crawlsite:
    """Crawl a single web site and collect the unique URLs of its HTML pages.

    The crawl starts at ``url`` and only follows links whose scheme and host
    match ``rootUrl``. Every page that passes the HEAD check and reports one
    of ``validContentTypes`` is recorded in ``uniqueLinks`` (keyed by the URL
    without its fragment) and in ``parsedurls`` (same keys, in visit order).
    """

    __author__ = 'Philip Roche'
    __version__ = '0.1'
    __date__ = '2006/10/20'

    # Setup some basic parameters
    useragentFirefox = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6) Gecko/20050223 Firefox/1.0.1"
    useragentIE6 = "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"
    useragentSelf = "Sitecrawler " + __version__ + " " + __date__ + " by " + __author__
    skippedProtocols = ("javascript", "mailto", "ftp", "gopher")
    validContentTypes = ("text/html", "application/xhtml+xml")

    # Starting url (constructor argument, command line argument or prompt)
    url = ""

    # Split form of the starting url and the derived "scheme://host" root
    urlparts = []
    rootUrl = ""

    # Class-level defaults only; __init__ rebinds parsedurls and uniqueLinks
    # so that separate crawler instances never share crawl state.
    parsedurls = []
    uniqueLinks = {}
    iNumberOfTimesCalled = 0
    iLinksGathered = 0
    bParseRedirects = True

    # Fragment-less urls that gatherLinks has already examined (accepted or
    # rejected). Created lazily so instances built without __init__ still work.
    _checkedUrls = None

    def __init__(self, urltocrawl=""):
        """Select the start url and derive the root url from it.

        urltocrawl -- url to crawl. When blank, the first command line
                      argument is used; when that is missing as well the
                      user is prompted for a url.
        """
        if urltocrawl.strip() == "":
            if len(sys.argv) < 2:
                self.url = raw_input("Supply url to crawl: ")
            else:
                self.url = sys.argv[1]
        else:
            self.url = urltocrawl.strip()

        self.urlparts = urlparse.urlsplit(self.url)
        self.rootUrl = self.urlparts[0] + "://" + self.urlparts[1]

        # Per-instance state: the class-level list/dict would otherwise be
        # mutated in place and leak results from one crawler into the next.
        self.parsedurls = []
        self.uniqueLinks = {}
        self._checkedUrls = set()

    def crawl(self):
        """Crawl from the start url and return the ``uniqueLinks`` dict."""
        self.gatherLinks(self.url, 0)
        return self.uniqueLinks

    def outputProgress(self, msg):
        """Write a progress message to stderr.

        stderr is used so that piping the script output to a file (the
        documented usage) stores only the link list.
        """
        sys.stderr.write(msg)

    # Is contenttype parsable?
    def isParsable(self, contentType):
        """Return True when contentType names one of ``validContentTypes``.

        contentType -- value of a Content-Type header, or None. The match is
                       a case-insensitive substring test, so parameters such
                       as "; charset=utf-8" do not prevent a match.
        """
        if contentType is None:
            return False
        normalized = contentType.lower()
        for validContentType in self.validContentTypes:
            if validContentType in normalized:
                return True
        return False

    def stripFragment(self, urlToStrip):
        """Return urlToStrip without its "#fragment" part.

        The result is built as scheme + "://" + host + path, followed by
        "?" + query when a query is present. The "?" separator is required:
        without it "/a?x=1" would collapse into "/ax=1" and the stored url
        would no longer identify the page.
        """
        parts = urlparse.urlsplit(urlToStrip)
        stripped = parts[0] + "://" + parts[1] + parts[2]
        if parts[3]:
            stripped = stripped + "?" + parts[3]
        return stripped

    def addUrlToHistory(self, urlToAdd):
        """Record urlToAdd as a parsed page.

        Stores a dict with the keys 'url', 'protocol', 'server', 'path',
        'query' and 'fragment' in ``uniqueLinks`` under the fragment-less
        url, and appends that fragment-less url to ``parsedurls``.
        """
        strStrippedURL = self.stripFragment(urlToAdd)
        protocol, server, path, query, fragment = urlparse.urlsplit(urlToAdd)

        self.uniqueLinks[strStrippedURL] = {
            'url': strStrippedURL,
            'protocol': protocol,
            'server': server,
            'path': path,
            'query': query,
            'fragment': fragment,
        }

        # Add url without fragment to list of parsed urls
        self.parsedurls.append(strStrippedURL)

    # Check if URL exists. Returns status and content type.
    def urlIsOk(self, urlToCheck, maxRedirects=10):
        """Check urlToCheck with a HEAD request.

        urlToCheck   -- absolute url to check.
        maxRedirects -- how many further redirects may be followed before
                        the url is reported as not OK; this bounds redirect
                        loops.

        Returns a tuple (ok, contentType, status):
            (True, "unknown", 0)        scheme is in ``skippedProtocols``
            (False, "unknown", 0)       no host, other site, or a fragment
                                        link to an already parsed page
            (True, contentType, 200)    server answered 200, or answered 302
                                        while ``bParseRedirects`` is true
            (False, contentType, code)  any other answer, or a redirect that
                                        cannot be followed
            (False, "unknown", 999)     an exception occurred (the traceback
                                        is written to stderr)
        """
        try:
            # Split the url to get the request item
            protocol, server, path, query, fragment = urlparse.urlsplit(urlToCheck)

            # Skip links where protocol is one of skippedProtocols
            if protocol in self.skippedProtocols:
                return (True, "unknown", 0)

            # Without a host there is nothing to send a request to
            if len(server) == 0:
                return (False, "unknown", 0)

            # Skip links to other sites. Scheme and host are compared
            # exactly; a substring search would also accept foreign urls
            # that merely contain the root url, e.g. in their query string.
            if protocol + "://" + server != self.rootUrl:
                return (False, "unknown", 0)

            # Skip same page links
            if len(fragment) > 0:
                if self.stripFragment(urlToCheck) in self.parsedurls:
                    return (False, "unknown", 0)

            # The request target needs the query string, and must not be
            # empty (a bare "http://host" url has an empty path).
            requestTarget = path
            if requestTarget == "":
                requestTarget = "/"
            if query:
                requestTarget = requestTarget + "?" + query

            # No explicit port: httplib then takes it from "host:port" or
            # falls back to the default port of the scheme.
            if protocol == "https":
                httpObj = httplib.HTTPSConnection(server)
            else:
                httpObj = httplib.HTTPConnection(server)

            # Check url header
            try:
                httpObj.connect()
                httpObj.putrequest('HEAD', requestTarget)
                httpObj.putheader('Accept', '*/*')
                httpObj.putheader('User-Agent', self.useragentSelf)
                httpObj.endheaders()
                response = httpObj.getresponse()
                status = response.status
                contentType = response.getheader("content-type")
                location = response.getheader("location")
            finally:
                # Close the connection even when the request fails
                httpObj.close()

            if status == 200:
                # Server reports url is OK.
                return (True, contentType, 200)

            if status == 302 and self.bParseRedirects:
                # The redirecting url itself is accepted; the redirect is
                # not followed here.
                # NOTE(review): the flag name suggests the opposite branch
                # layout; the original behaviour is kept - confirm intent.
                return (True, contentType, 200)

            if status == 301 or status == 302:
                # Moved - read location and check the target instead
                if not location or maxRedirects <= 0:
                    return (False, contentType, status)
                # Location may be relative; resolve it against this url
                target = urlparse.urljoin(urlToCheck, location)
                return self.urlIsOk(target, maxRedirects - 1)

            # Server error message
            return (False, contentType, status)

        except Exception:
            sys.stderr.write(traceback.format_exc())
            return (False, "unknown", 999)

    def checkUrl(self, urlToCheck):
        """Return True when urlToCheck is reachable and has a crawlable type."""
        result = self.urlIsOk(urlToCheck)
        if not result[0]:
            return False
        # Determine if link is crawlable
        return self.isParsable(result[1])

    # Get html for a page
    def getContent(self, url):
        """Return the body of url, or "" when it cannot be retrieved."""
        try:
            response = urllib2.urlopen(url)
            try:
                return response.read()
            finally:
                response.close()
        except Exception:
            # Best effort: an unreadable page simply contributes no links
            return ""

    # Get data
    def gatherLinks(self, currentUrl, currentlevel):
        """Crawl currentUrl and every on-site page reachable from it.

        currentUrl   -- url to start from.
        currentlevel -- not used by the crawl; kept so existing callers
                        keep working.

        Pages are visited depth-first, in the order the recursive version
        of this method visited them, but with an explicit stack so that a
        long chain of links cannot exceed the interpreter recursion limit.
        """
        if self._checkedUrls is None:
            self._checkedUrls = set()

        pending = [currentUrl]
        while pending:
            url = pending.pop()
            self.iNumberOfTimesCalled = self.iNumberOfTimesCalled + 1

            if len(self.uniqueLinks) > self.iLinksGathered:
                self.iLinksGathered = len(self.uniqueLinks)
                self.outputProgress(str(self.iLinksGathered) + " unique urls have been gathered\r")

            # Check if URL already parsed or already rejected. Remembering
            # rejected urls avoids a new HEAD request every time the same
            # image, stylesheet or broken link shows up on another page.
            strippedUrl = self.stripFragment(url)
            if strippedUrl in self._checkedUrls or strippedUrl in self.parsedurls:
                continue
            self._checkedUrls.add(strippedUrl)

            # Check if url is ok
            if not self.checkUrl(url):
                continue

            # Get doc
            contents = self.getContent(url)

            # Add url to list
            self.addUrlToHistory(url)

            # Queue the links found on this page. They are pushed in
            # reverse so that the first link is popped (visited) first.
            # Presumably urlextract resolves links relative to url - see
            # the htmldata manual.
            gatheredlinks = htmldata.urlextract(contents, url)
            foundUrls = [u.url for u in gatheredlinks]
            foundUrls.reverse()
            pending.extend(foundUrls)


if __name__ == '__main__':
    # Start script
    crawlsiteObj = crawlsite("")
    uniqueLinks = crawlsiteObj.crawl()
    print("\n\nUnique Links\n")
    for u in uniqueLinks:
        print(u)