Hemanth's Scribes

python

Web Crawler with Python Twisted

Author Photo

Hemanth HM

Thumbnail

A simple HTTP crawler built with Python Twisted.

#!/usr/bin/env python
from twisted.web.client import getPage
from BeautifulSoup import BeautifulSoup
from twisted.python import log
from twisted.internet import defer, task
import re

def parallel(iterable, count, callable, *args, **named):
    """Apply ``callable`` to each element of ``iterable`` using ``count`` workers.

    A single lazy generator of ``callable(elem, *args, **named)`` calls is
    shared by ``count`` cooperative tasks, so each element is consumed by
    exactly one task and at most ``count`` calls are pending at a time.

    Args:
        iterable: Elements to process. It is consumed lazily, so items added
            to a list while it is still being iterated are picked up too.
        count: Number of cooperative tasks pulling from the shared generator.
        callable: Function invoked once per element. Per the Twisted
            Cooperator documentation, if it returns a Deferred the task
            pauses until that Deferred fires.
        *args: Extra positional arguments forwarded to ``callable``.
        **named: Extra keyword arguments forwarded to ``callable``.

    Returns:
        A ``DeferredList`` wrapping the ``count`` task Deferreds.
    """
    coop = task.Cooperator()
    # One generator shared by every task: this is what bounds concurrency.
    work = (callable(elem, *args, **named) for elem in iterable)
    # range() instead of xrange(): same result for a small count, and it
    # also exists on Python 3.
    return defer.DeferredList([coop.coiterate(work) for _ in range(count)])

def union(p, q):
    """Append to ``q``, in place, each element of ``p`` not already in it.

    Every element that gets added is also printed, one per line. Elements
    of ``p`` already present in ``q`` (including duplicates within ``p``)
    are skipped, and the relative order of first occurrences is kept.

    Args:
        p: Iterable of candidate elements.
        q: List that is extended in place.

    Returns:
        None; the result is the mutation of ``q``.
    """
    for e in p:
        if e not in q:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3 (the bare statement is Python 2 only).
            print(e)
            q.append(e)
# Absolute web links only. Accepts both http:// and https:// so that secure
# links are not silently dropped from the crawl.
_ABSOLUTE_LINK = re.compile(r"^https?://")


def extractLinks(html):
    """Return the absolute http(s) link targets found in ``html``.

    Args:
        html: Markup of a fetched page.

    Returns:
        A list of ``str`` values, one per ``<a>`` tag whose ``href`` starts
        with ``http://`` or ``https://``, in document order. Relative links
        and other schemes are ignored.
    """
    soup = BeautifulSoup(html)
    # The regex filter already guarantees a non-empty href, so no further
    # truthiness check is needed.
    anchors = soup.findAll('a', attrs={'href': _ABSOLUTE_LINK})
    return [str(anchor['href']) for anchor in anchors]

def crawlPage(url, urlList):
    """Fetch ``url`` and merge the links found on it into ``urlList``.

    The pipeline is: download the page, extract its links, then add the
    ones not yet known to ``urlList``. Any failure along the way is handed
    to ``log.err``.

    Returns:
        The Deferred driving that pipeline.
    """
    # Each add* call returns the Deferred itself, so the steps chain.
    return (
        getPage(url)
        .addCallback(extractLinks)
        .addCallback(union, urlList)
        .addErrback(log.err)
    )

def main(reactor, *args):
    """Crawl starting from the URLs given in ``args``.

    The same list serves as both the work queue and the record of known
    URLs: links appended to it by ``crawlPage`` are reached by the iteration
    inside ``parallel`` if it is still running, so the crawl keeps expanding.

    Args:
        reactor: The reactor supplied by the caller (unused here).
        *args: Seed URLs.

    Returns:
        The Deferred returned by ``parallel``.
    """
    frontier = list(args)
    # One worker per seed URL.
    workers = len(frontier)
    return parallel(frontier, workers, crawlPage, frontier)

if __name__ == '__main__':
    # Script entry point: hand main() and the seed URLs to task.react,
    # which calls main(reactor, *seeds).
    task.react(
        main,
        [
            "http://h3manth.com",
            "http://www.test.com",
        ],
    )

Requires: Python 2, BeautifulSoup 3, PyOpenSSL (for HTTPS pages), and Twisted 12.3+

#python #twisted #web-crawler
Author Photo

About Hemanth HM

Hemanth HM is a Sr. Machine Learning Manager at PayPal, Google Developer Expert, TC39 delegate, FOSS advocate, and community leader with a passion for programming, AI, and open-source contributions.