Beautiful Soup for web crawling
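The short script below crawls the IIT Kharagpur department listing pages, picks out every anchor that points to a faculty profile page, and writes the full URLs to out.txt.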
from bs4 import BeautifulSoup
import requests

# Two-letter department codes used by the IIT Kharagpur faculty listing.
deptCodes = ["AE", "AG", "AR", "BT", "CH", "CM", "CE", "CS", "EE", "EC", "MG", "HS", "IM", "MM", "ME", "MT", "MI", "NA", "MP",
             "ED", "CR", "MS", "N2", "PK", "RE", "RT", "RD", "GS", "IT", "RJ", "RG", "ID", "MD", "BS", "EF", "ES", "NT", "WM",
             "SM"]
print(len(deptCodes))

outfile = open("out.txt", "w")
f = []
for dept in deptCodes:
    print(dept)
    fetchUrl = "http://www.iitkgp.ac.in/commdir3/list.php?division=3&deptcode=" + dept
    try:
        page = requests.get(fetchUrl)
        page.raise_for_status()
    except requests.RequestException:
        break  # stop crawling if a department page cannot be fetched
    soup = BeautifulSoup(page.text, "html.parser")
    links = soup("a")  # shorthand for soup.find_all("a")
    facPage = "fac-profiles"
    for link in links:
        # Keep only anchors whose href points to a faculty profile page.
        if link.has_attr("href") and facPage in link["href"]:
            f.append(link["href"])
            outfile.write("http://www.iitkgp.ac.in" + link["href"] + "\n")

print(len(f))        # total profile links collected
print(len(set(f)))   # unique profile links
outfile.close()
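The final two prints compare total and unique link counts, which suggests deduplicating before writing. Here is a minimal sketch of the same extraction using BeautifulSoup's CSS-selector API instead of the manual has_attr check, writing each unique URL once; the single "CS" department and the out_unique.txt filename are illustrative assumptions, not part of the original script:

from bs4 import BeautifulSoup
import requests

fetchUrl = "http://www.iitkgp.ac.in/commdir3/list.php?division=3&deptcode=CS"
soup = BeautifulSoup(requests.get(fetchUrl).text, "html.parser")

# [href*="..."] selects anchors whose href contains the given substring,
# so this one selector replaces the has_attr/substring checks above.
hrefs = {a["href"] for a in soup.select('a[href*="fac-profiles"]')}
with open("out_unique.txt", "w") as outfile:
    for href in sorted(hrefs):
        outfile.write("http://www.iitkgp.ac.in" + href + "\n")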