A simple HTTP crawler built with Python Twisted.
#!/usr/bin/env python
from twisted.web.client import getPage
from BeautifulSoup import BeautifulSoup
from twisted.python import log
from twisted.internet import defer, task
import re
def parallel(iterable, count, callable, *args, **named):
    """Apply ``callable`` to every element of ``iterable`` via ``count`` tasks.

    One generator producing ``callable(elem, *args, **named)`` is shared by
    ``count`` ``coiterate`` tasks on a single ``Cooperator``, so each element
    is consumed by exactly one task.

    Args:
        iterable: Source of elements to process.
        count: Number of ``coiterate`` tasks started over the shared
            generator.
        callable: Function invoked once per element. The name shadows the
            builtin but is kept because it is part of the signature.
        *args: Extra positional arguments forwarded to ``callable``.
        **named: Extra keyword arguments forwarded to ``callable``.

    Returns:
        A ``DeferredList`` wrapping the ``count`` Deferreds returned by
        ``coiterate``.
    """
    coop = task.Cooperator()
    # A generator, not a list: the calls are made lazily, one per step taken
    # by whichever task advances the shared iterator next.
    work = (callable(elem, *args, **named) for elem in iterable)
    # range instead of xrange: identical result here and available on both
    # Python 2 and Python 3.
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])
def union(p, q):
    """Append to ``q``, in place, each element of ``p`` it does not yet hold.

    Every element that gets appended is also printed to standard output.
    Duplicates inside ``p`` are added only once, because the membership
    test runs against the already-updated ``q``.

    Args:
        p: Iterable of candidate elements.
        q: List that is extended in place; membership is tested with ``in``.

    Returns:
        None. The result is the mutation of ``q``.
    """
    for e in p:
        if e not in q:
            # Single-argument print(...) emits the same text whether it is
            # parsed as the Python 2 statement or the Python 3 function.
            print(e)
            q.append(e)
def extractLinks(html):
    """Return the ``http://`` link targets of the ``<a>`` tags in ``html``.

    Only anchors whose ``href`` matches ``^http://`` are selected, so
    relative links and ``https://`` links are not returned.

    Args:
        html: Page markup, passed straight to ``BeautifulSoup``.

    Returns:
        A list with ``str(href)`` for each matching anchor, in the order
        ``findAll`` yields them.
    """
    soup = BeautifulSoup(html)
    # No prettify() call: it only returns a re-formatted copy of the markup,
    # and nothing here uses that copy.
    anchors = soup.findAll('a', attrs={'href': re.compile("^http://")})
    # NOTE(review): str() on a non-ASCII href may raise under Python 2 --
    # confirm whether such links need to be supported.
    return [str(anchor['href']) for anchor in anchors if anchor['href']]
def crawlPage(url, urlList):
    """Fetch ``url`` and merge the links found on the page into ``urlList``.

    Args:
        url: Address handed to ``getPage``.
        urlList: List passed to ``union`` as the collection to extend.

    Returns:
        The Deferred created by ``getPage``, with the extraction, merge and
        error-logging callbacks attached.
    """
    # Each add* call returns the Deferred it was invoked on, so this chain
    # attaches: body -> extractLinks -> union(links, urlList), and routes any
    # failure from those steps to log.err.
    return (
        getPage(url)
        .addCallback(extractLinks)
        .addCallback(union, urlList)
        .addErrback(log.err)
    )
def main(reactor, *args):
    """Start crawling from the URLs given as positional arguments.

    Args:
        reactor: Accepted as the first argument but not used here.
        *args: Seed URLs.

    Returns:
        Whatever ``parallel`` returns for the crawl.
    """
    # One list plays two roles: it is the iterable that parallel() walks and
    # the urlList that crawlPage() extends, so URLs appended while crawling
    # land in the very list that is being iterated.
    frontier = list(args)
    workers = len(frontier)
    return parallel(frontier, workers, crawlPage, frontier)
if __name__ == '__main__':
    # Script entry point: hand main() to task.react together with the seed
    # URLs, which main() receives as its positional arguments.
    task.react(
        main,
        [
            "http://h3manth.com",
            "http://www.test.com",
        ],
    )
Requires: Python 2, BeautifulSoup 3, PyOpenSSL, and Twisted 12.3+
#python #twisted #web-crawler
About Hemanth HM
Hemanth HM is a Sr. Machine Learning Manager at PayPal, Google Developer Expert, TC39 delegate, FOSS advocate, and community leader with a passion for programming, AI, and open-source contributions.