1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""This is a liner implementation of a simple HTTP crawler.
This is crawler crawlers a given URL till a specified limit,
or till limit tends to infinity.
TODO :
1. import robotparser and parse robots.txt
2. Write the URL to DB using sqlite.
3. Content type validation using response.info().headers
"""
import urllib2
import socket
from lxml.html import parse
import argparse
import sys
import re
socket.setdefaulttimeout(10)
class Spidy:
    """Simple depth-first web spider.

    Public API is ``crawl()``.  State is kept on the instance:
    ``crawled`` -- URLs successfully fetched, in visit order;
    ``failed``  -- ``[url, exception]`` pairs for pages that errored.
    """
    def __init__(self, url):
        # Seed URL the crawl starts from.
        self.seed = url
        self.failed = []
        self.crawled = []
    def __union(self, p, q):
        """In-place union: append to *p* every element of *q* not already
        present, preserving order (equivalent to list(set(p) | set(q))
        but order-stable)."""
        for e in q:
            if e not in p:
                p.append(e)
    def __extractLinks(self, page):
        """Fetch *page* with lxml and return every non-empty href,
        made absolute against the page URL."""
        dom = parse(page).getroot()
        dom.make_links_absolute()
        links = dom.cssselect('a')
        return [link.get('href') for link in links if link.get('href')]
    def crawl(self, limit=float('inf')):
        """Crawl outward from the seed URL until *limit* pages have been
        fetched (default: unbounded).  Returns the list of crawled URLs.

        *limit* is coerced to float: previously a string limit (e.g. the
        raw argparse value) compared as ``int < str``, which is always
        True under Python 2, so the limit was silently ignored.
        """
        limit = float(limit)  # guard against callers passing a string
        tocrawl = [self.seed]
        while tocrawl and len(self.crawled) < limit:
            page = tocrawl.pop()
            # Emit each URL as we visit it so output can be redirected;
            # sys.stdout.write keeps this portable across Python 2 and 3.
            sys.stdout.write("%s\n" % page)
            if page not in self.crawled:
                try:
                    self.__union(tocrawl, self.__extractLinks(page))
                    self.crawled.append(page)
                except Exception as e:
                    sys.stdout.write("%s\n" % e)
                    self.failed.append([page, e])  # Failed! write to DB.
        return self.crawled
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Spidy a simple web crawler')
parser.add_argument('-u', '--url', help='URL to crawl',required=True)
parser.add_argument('-l', '--limit', help='Crawlling limit', required=False)
args = parser.parse_args()
url = args.url
limit = args.limit
if re.match("^https?://", url):
try:
urllib2.urlopen(url)
except IOError:
print "Not a real URL"
sys.exit(0)
else:
print "Sorry only http or https urls are accepted as of now"
sys.exit(0)
if not url.endswith("/"):
url+="/" # Needs a trailing slash.
spider = Spidy(url)
spider.crawl() if limit==None else spider.crawl(limit)