Sunday, 17 July 2016

Beautiful Soup for Crawling

This script crawls the IIT Kharagpur department listing pages with Beautiful Soup and collects links to faculty profile pages, writing them to out.txt. It also prints how many profile links were found in total and how many of those are unique.


 from bs4 import BeautifulSoup
 import requests

 # Department codes used by the IIT Kharagpur faculty listing pages.
 deptCodes = ["AE", "AG", "AR", "BT", "CH", "CM", "CE", "CS", "EE", "EC",
              "MG", "HS", "IM", "MM", "ME", "MT", "MI", "NA", "MP", "ED",
              "CR", "MS", "N2", "PK", "RE", "RT", "RD", "GS", "IT", "RJ",
              "RG", "ID", "MD", "BS", "EF", "ES", "NT", "WM", "SM"]
 print(len(deptCodes))

 outfile = open("out.txt", "w")
 facLinks = []
 for dept in deptCodes:
     print(dept)
     fetchUrl = "http://www.iitkgp.ac.in/commdir3/list.php?division=3&deptcode=" + dept
     try:
         page = requests.get(fetchUrl)
         page.raise_for_status()
     except requests.RequestException:
         continue  # skip this department if the fetch fails
     soup = BeautifulSoup(page.text, "html.parser")
     # Keep only anchors whose href points at a faculty profile page.
     for link in soup.find_all("a", href=True):
         if "fac-profiles" in link["href"]:
             facLinks.append(link["href"])
             outfile.write("http://www.iitkgp.ac.in" + link["href"] + "\n")

 print(len(facLinks))       # total profile links found
 print(len(set(facLinks)))  # unique profile links
 outfile.close()
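
If your Beautiful Soup is version 4.7 or newer (where CSS selectors are backed by soupsieve), the href filtering can be written more compactly as an attribute substring selector. A minimal sketch for a single department page, assuming the same page structure as above:

 from bs4 import BeautifulSoup
 import requests

 # The CSS selector a[href*="fac-profiles"] matches anchors whose href
 # contains the substring "fac-profiles" (requires bs4 >= 4.7).
 page = requests.get("http://www.iitkgp.ac.in/commdir3/list.php?division=3&deptcode=CS")
 soup = BeautifulSoup(page.text, "html.parser")
 for link in soup.select('a[href*="fac-profiles"]'):
     print("http://www.iitkgp.ac.in" + link["href"])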