import cgi
import datetime
import httplib
import os
import sys
import time
import urlparse

from crawlsite import crawlsite


# Command-line tool / helper class that crawls a site with `crawlsite` and
# writes the URLs it reports to "<server>-sitemap.xml" in the current
# working directory.


class createSiteMap:
    """Crawl a URL and write the collected URLs to a sitemap file.

    Instantiating the class runs the whole pipeline: it resolves the URL to
    crawl, runs ``crawlsite(url).crawl()``, builds the sitemap text and writes
    it to ``<server>-sitemap.xml``.  The generated text stays available
    through ``GetSiteMap()``.
    """

    # Text of the most recently built sitemap; empty until BuildSiteMap runs.
    strSiteMap = ""

    def __init__(self, urltocrawl=""):
        """Resolve the URL, crawl it, then build and write the sitemap.

        Args:
            urltocrawl: URL to crawl.  When it is empty or only whitespace,
                the first command-line argument is used instead; when there
                is none, the user is prompted on stdin.
        """
        requested = urltocrawl.strip()
        if requested:
            self.url = requested
        elif len(sys.argv) >= 2:
            self.url = sys.argv[1]
        else:
            # raw_input is the Python 2 builtin; this file targets Python 2.
            self.url = raw_input("Supply url to crawl: ")

        crawler = crawlsite(self.url)
        unique_urls = crawler.crawl()
        self.BuildSiteMap(unique_urls)
        self.WriteSiteMap()

    def GetSiteMap(self):
        """Return the sitemap text produced by the last BuildSiteMap call."""
        return self.strSiteMap

    def BuildSiteMap(self, dUniqueUrls):
        """Assemble the full sitemap text and store it in ``self.strSiteMap``.

        The text is a fixed header, an entry for ``self.url``, one entry per
        crawled URL (see BuildSiteMapURLS) and a fixed footer.

        Args:
            dUniqueUrls: mapping of crawl records as accepted by
                BuildUniqueURLS.
        """
        # NOTE(review): the literals below contain only whitespace, so no XML
        # markup is emitted even though the output file is named "*.xml".
        # Confirm whether tags were lost from these strings.
        pieces = [
            '\n',
            '\n',
            '\t\n',
            '\t\t',
            self.url,
            '\n',
            '\t\n',
            self.BuildSiteMapURLS(dUniqueUrls),
            '\n',
        ]
        self.strSiteMap = ''.join(pieces)

    def BuildUniqueURLS(self, dUniqueUrls):
        """Return the lower-cased ``'url'`` value of every crawl record.

        Args:
            dUniqueUrls: mapping whose values are dict-like records with at
                least a ``'url'`` string entry.

        Returns:
            A list of lower-cased URL strings, in the mapping's iteration
            order.  No de-duplication is performed here.

        Raises:
            KeyError: if a record has no ``'url'`` entry.
        """
        # NOTE(review): lower-casing the whole URL also lower-cases the path
        # and query, which may be case-sensitive on the server -- confirm
        # this is intended.
        return [dUniqueUrls[key]['url'].lower() for key in dUniqueUrls]

    def ListSizeCompare(self, lx, ly):
        """cmp-style comparator that orders longer sequences first.

        Returns -1 when ``lx`` is longer than ``ly``, 0 when both have the
        same length and 1 otherwise.  Kept for backward compatibility;
        BuildSiteMapURLS sorts with ``key=len`` instead.
        """
        lx_length = len(lx)
        ly_length = len(ly)
        if lx_length > ly_length:
            return -1
        if lx_length == ly_length:
            return 0
        return 1

    def WriteSiteMap(self):
        """Write ``self.strSiteMap`` to ``<server>-sitemap.xml``.

        ``<server>`` is the lower-cased network-location part of
        ``self.url``.  The file is created in (or overwritten in) the current
        working directory.
        """
        server = urlparse.urlsplit(self.url)[1].lower()
        sitemap_name = server + "-sitemap.xml"
        # The context manager closes the file even if the write fails.
        with open(sitemap_name, "w") as sitemap_file:
            sitemap_file.write(self.strSiteMap)

    def BuildSiteMapURLS(self, dUniqueUrls):
        """Return the per-URL portion of the sitemap text.

        URLs are emitted longest first; URLs of equal length keep the order
        in which BuildUniqueURLS returned them.

        Args:
            dUniqueUrls: mapping of crawl records as accepted by
                BuildUniqueURLS.
        """
        urls = self.BuildUniqueURLS(dUniqueUrls)
        # Same ordering as sorting with ListSizeCompare (descending length,
        # stable), without relying on the Python-2-only cmp argument.
        urls.sort(key=len, reverse=True)
        return ''.join('\t\n' + '\t\t' + url + '\n' + '\t\n' for url in urls)


if __name__ == '__main__':
    createSiteMapObj = createSiteMap()
    strSiteMap = createSiteMapObj.GetSiteMap()
    # Single-argument parenthesised form prints identically on Python 2.
    print("\n\nSiteMap")
    print(strSiteMap)