import os
import sys
from crawlsite import crawlsite
import urlparse
import cgi
import httplib
import time
import datetime
class createSiteMap:
    """Crawl a site and build a text site map of the unique URLs found.

    Constructing an instance does all of the work: it resolves the URL to
    crawl, runs ``crawlsite(url).crawl()``, builds the site map string and
    writes it to ``<server>-sitemap.xml`` in the current directory.  The
    generated text is afterwards available through ``GetSiteMap()``.
    """

    # Class-level default so GetSiteMap() returns "" until BuildSiteMap runs.
    strSiteMap = ""

    def __init__(self, urltocrawl=""):
        """Resolve the start URL, crawl it, then build and write the site map.

        The URL is taken from ``urltocrawl`` when it is non-blank (it is
        stripped), otherwise from ``sys.argv[1]`` when present, otherwise
        from an interactive prompt.
        """
        urltocrawl = urltocrawl.strip()
        if urltocrawl:
            self.url = urltocrawl
        elif len(sys.argv) >= 2:
            self.url = sys.argv[1]
        else:
            self.url = raw_input("Supply url to crawl: ")
        dUniqueUrls = crawlsite(self.url).crawl()
        self.BuildSiteMap(dUniqueUrls)
        self.WriteSiteMap()

    def GetSiteMap(self):
        """Return the site map text built by ``BuildSiteMap``."""
        return self.strSiteMap

    def BuildSiteMap(self, dUniqueUrls):
        """Assemble the full site map text and store it in ``self.strSiteMap``.

        The text is a header, an entry for ``self.url``, the entries returned
        by ``BuildSiteMapURLS(dUniqueUrls)`` and a trailing newline.
        """
        # NOTE(review): the result is written to a *.xml file but these
        # literals hold only whitespace; it looks like XML markup was lost
        # from them -- confirm against the intended sitemap format.
        self.strSiteMap = ''.join([
            '\n',
            '\n',
            '\t\n',
            '\t\t', self.url, '\n',
            '\t\n',
            self.BuildSiteMapURLS(dUniqueUrls),
            '\n',
        ])

    def BuildUniqueURLS(self, dUniqueUrls):
        """Return the lower-cased ``'url'`` value of every entry.

        ``dUniqueUrls`` is indexed as ``dUniqueUrls[key]['url']`` for each key
        it yields, and that value must support ``.lower()``.  Only the
        ``'url'`` field is needed; the other per-entry fields are no longer
        read, so entries lacking them no longer raise ``KeyError``.
        """
        # NOTE(review): lower-casing the whole URL also lower-cases the path
        # and query -- confirm that is intended for case-sensitive servers.
        return [dUniqueUrls[key]['url'].lower() for key in dUniqueUrls]

    def ListSizeCompare(self, lx, ly):
        """cmp-style comparator ordering longer items first.

        Returns -1 when ``lx`` is longer than ``ly``, 0 when the lengths are
        equal and 1 when ``lx`` is shorter.
        """
        lxLength = len(lx)
        lyLength = len(ly)
        if lxLength > lyLength:
            return -1
        if lxLength == lyLength:
            return 0
        return 1

    def WriteSiteMap(self):
        """Write ``self.strSiteMap`` to ``<server>-sitemap.xml``.

        ``<server>`` is the lower-cased network-location part of
        ``self.url`` as returned by ``urlparse.urlsplit``.
        """
        server = urlparse.urlsplit(self.url)[1].lower()
        strSiteMapName = server + "-sitemap.xml"
        # The context manager closes the file even if write() raises.
        with open(strSiteMapName, "w") as siteMapFile:
            siteMapFile.write(self.strSiteMap)

    def BuildSiteMapURLS(self, dUniqueUrls):
        """Return the concatenated site map entries, longest URL first.

        URLs of equal length keep the order in which ``BuildUniqueURLS``
        produced them.  Returns ``""`` when there are no URLs.
        """
        lUniqueURLS = self.BuildUniqueURLS(dUniqueUrls)
        # Same ordering as sorting with ListSizeCompare (descending length,
        # stable), expressed with a key function instead of a cmp function.
        lUniqueURLS.sort(key=len, reverse=True)
        return ''.join(['\t\n\t\t' + url + '\n\t\n' for url in lUniqueURLS])
if __name__ == '__main__':
    # Script entry point: constructing createSiteMap runs the crawl and
    # writes the output file; the generated text is then echoed to stdout.
    site_map_builder = createSiteMap()
    print("\n\nSiteMap")
    print(site_map_builder.GetSiteMap())