Source code for crawler

#!/usr/bin/env python
# encoding: utf-8
"""
crawler.py

Created by 徐 光硕 on 2011-11-24.
Copyright (c) 2011 __MyCompanyName__. All rights reserved.
"""

import requests
from requests import Response, RequestException
import random
import re
import time
try:
    import json
except ImportError:
    import simplejson as json
from BeautifulSoup import BeautifulSoup
from pprint import pprint
from urlparse import urlparse
from utils import ThreadPool

class SearchResults:
    '''Search Results Object'''

    def __init__(self, q, total, lists, malls, ads, cats, keywords):
        self.q = q
        self.total = total
        self.lists = lists
        self.malls = malls
        self.ads = ads
        self.cats = cats
        self.keywords = keywords

    def __str__(self):
        return "<%s: q=%s, total=%d, lists=%d, malls=%d, ads=%d, cats=%d, keywords=%d>" % (
            self.__class__.__name__, self.q, self.total, len(self.lists),
            len(self.malls), len(self.ads), len(self.cats), len(self.keywords))

    __repr__ = __str__

class Cat:
    '''The category object in search results'''

    def __init__(self, id, url, title, amount):
        self.id = id
        self.url = url
        self.title = title
        self.amount = amount

    def __str__(self):
        return "<%s: id=%d, title=%s, amount=%d>" % (
            self.__class__.__name__, self.id, self.title, self.amount)

    __repr__ = __str__

class Item:
    '''The item object in search results'''

    def __init__(self, id, summary, photo, attrs, legends=None):
        self.id = id
        self.photo = photo
        self.summary = summary
        self.attrs = attrs
        self.legends = legends

    def __str__(self):
        if self.legends:
            lgs = ','.join(self.legends)
        else:
            lgs = None
        return "<%s: id=%d, summary=%s, img=%s, attrs=%s, legends=%s>" % (
            self.__class__.__name__, self.id, self.summary, self.photo,
            self.attrs, lgs)

    __repr__ = __str__

class ItemAttr:
    '''The item's attributes object in search results'''

    def __init__(self, price, transaction, shipment_fee=None, location=None,
                 seller=None, seller_id=None):
        self.price = price
        self.transaction = transaction
        self.shipment_fee = shipment_fee
        self.seller = seller
        self.seller_id = seller_id
        self.location = location

    def __str__(self):
        return ("<%s: price=%.2f, transaction=%d, shipment_fee=%.2f, "
                "location=%s, seller=%s, seller_id=%d>") % (
            self.__class__.__name__, self.price, self.transaction,
            self.shipment_fee, self.location, self.seller, self.seller_id)

    __repr__ = __str__

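# Crawler.ads_parser() below builds Ad(**item) from the p4p sidebar JSON, but
# no Ad class is defined or imported in this module. The class below is a
# minimal hedged sketch, not the original implementation: it assumes nothing
# about Taobao's JSON field names and simply keeps every key as an attribute.
class Ad:
    '''The sponsored-ad object in search results (hypothetical sketch)'''

    def __init__(self, **kwargs):
        # store whatever keys the p4p JSON object carries
        self.__dict__.update(kwargs)

    def __str__(self):
        return "<%s: %s>" % (self.__class__.__name__, self.__dict__)

    __repr__ = __str__
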
class Crawler:
    '''General Crawler for Taobao Site.'''

    def __init__(self, proxies=None):
        # avoid a mutable default argument; fall back to no proxies
        self.proxies = proxies or []
        self.search_url = 'http://s.taobao.com/search'

    def search(self, keyword):
        '''Get search results for a specific keyword'''
        # normalize the query to unicode before re-encoding it as GBK
        if type(keyword) != unicode:
            q = keyword.decode('utf-8')
        else:
            q = keyword
        req = self.fetch(self.search_url, {'q': q.encode('gbk')})
        if not req:
            return None
        html = req.content.decode('gbk').encode('utf-8')
        soup = BeautifulSoup(html)
        cats = self.cats_parser(soup)
        keywords = self.keywords_parser(soup)
        mall_items = self.mall_items_parser(soup)
        total = int(soup.find('div', attrs={'class': 'user-easy'}).find('a').string)
        lists = self.lists_parser(soup)
        ads = self.ads_parser(soup)
        return SearchResults(keyword, total, lists, mall_items, ads, cats, keywords)

    def cats_parser(self, soup):
        return [Cat(int(re.findall('cat\=(\d+)', dd.a['href'])[0]),
                    dd.a['href'],
                    dd.a['title'].encode('utf-8'),
                    int(dd.contents[1][1:-1]))
                for dd in soup.find('div', id='J_AllCates').findAll('dd')]

    def keywords_parser(self, soup):
        return [a.string.encode('utf-8')
                for a in soup.find('dl', attrs={'class': 'related-search'}).findAll('a')]

    def mall_items_parser(self, soup):
        items = []
        for li in soup.find('div', id='J_mall').findAll('li', attrs={'class': 'list-item'}):
            items.append(self.list_item_parser(li))
        return items

    def lists_parser(self, soup):
        items = []
        for li in soup.find('ul', attrs={'class': 'list-view-enlarge-4pic with-shipment lazy-list-view'}).findAll('li', attrs={'class': 'list-item'}):
            items.append(self.list_item_parser(li))
        return items

    def ads_parser(self, soup):
        # the sponsored (p4p) ads are loaded by a separate request whose URL is
        # embedded in an inline script tag on the results page
        url = 'http:%s%d' % (re.findall(r'\'(//tmatch.simba.taobao.com/[^^]*?)\'',
                                        soup.find('div', 'col-main').script.string)[0],
                             int(time.time()))
        req = self.fetch(url)
        items = [Ad(**item) for item in
                 json.loads(re.findall(r'__p4p_sidebar__ \=(\[[^^]*?\])\n', req.content)[0])]
        return items

    def item_parser(self, li):
        itemAttr = ItemAttr(
            float(li.find('div', attrs={'class': 'price'}).strong.string),
            self.parse_float(li.find('div', attrs={'class': 'transaction'}).string)
        )
        url = li.find('div', attrs={'class': 'title'}).a['href']
        req = self.fetch(url)
        if req:
            item_id = int(re.findall(r'id\=(\d+)', req.url)[0])
        else:
            item_id = None
        item = Item(item_id,
                    li.find('div', attrs={'class': 'title'}).a['title'],
                    li.find('div', attrs={'class': 'pic'}).img['src'],
                    itemAttr)
        return item

    def list_item_parser(self, li):
        # parse attributes
        attrs = li.find('ul', attrs={'class': 'attribute'})
        attr_price = attrs.find('li', attrs={'class': 'price'})
        attr_shipment = attrs.find('li', attrs={'class': 'shipment'})
        attr_seller = attrs.find('li', attrs={'class': 'seller'})
        item_attr = ItemAttr(
            float(attr_price.em.string),
            int(re.findall('\d+', attr_price.span.string)[0]),
            self.parse_float(attr_shipment.find('span', attrs={'class': 'fee'}).string),
            attr_shipment.find('span', attrs={'class': 'loc'}).string.strip().encode('utf-8'),
            attr_seller.a.string.encode('utf-8'),
            int(re.findall('user_number_id\=(\d+)', attr_seller.a['href'])[0])
        )
        # parse legends
        legends = li.find('div', attrs={'class': 'legend2'})
        lgs = []
        for a in legends.findAll('a'):
            if 'class' in a and a['class'] == 'mall-icon':
                lgs.append('商城')  # the "Mall" badge on Tmall listings
            elif a.string:
                lgs.append(a.string.encode('utf-8'))
        # parse summary
        desc_a = li.find('h3', attrs={'class': 'summary'}).a
        try:
            desc_img = li.find('div', attrs={'class': re.compile(r".*\bphoto\b.*")}).find('img')['src'].encode('utf-8')
        except KeyError:
            # lazily loaded images keep the real URL in a data attribute
            desc_img = li.find('div', attrs={'class': re.compile(r".*\bphoto\b.*")}).find('img')['data-ks-lazyload'].encode('utf-8')
        item = Item(int(re.findall('id\=(\d+)', desc_a['href'])[0]),
                    desc_a['title'].encode('utf-8'),
                    desc_img,
                    item_attr,
                    lgs)
        return item

    def get_top_keywords(self, cats=None, parent=None, up=True):
        '''Get top keywords for all the categories'''
        if not cats:
            cats = self.get_cats()
        if not cats:
            return []
        threadPool = ThreadPool(len(cats) if len(cats) <= 5 else 5)
        for cat in cats:
            threadPool.run(self.cat_top_keywords_thread, callback=None,
                           cat=cat, parent=parent, up=up)
        cats = threadPool.killAllWorkers(None)
        return cats

    def cat_top_keywords_thread(self, cat, parent, up):
        if 'children' in cat and cat['children']:
            cat['children'] = self.get_top_keywords(cat['children'], cat['id'], up)
        if 'level' in cat:
            if cat['level'] == 2:
                cat['keywords'] = self.cat_top_keywords(cat['id'], '', up)
            elif cat['level'] == 3 and parent:
                cat['keywords'] = self.cat_top_keywords(parent, cat['level'], up)
        return cat

    def cat_top_keywords(self, cat, level3='', up=True, offset=0, offsets=[]):
        '''Get top keywords in a specific category'''
        response = []
        if not offsets or offset == 0:
            url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
                str(cat), str(level3), 'true' if up else '', offset)
            rs = self.fetch(url)
            if not rs:
                return response
            soup = BeautifulSoup(rs.content)
            response = self.parse_cat_top_keywords(soup, offset)
            if offset == 0:
                offsets = self.get_cat_top_keywords_pages(soup, offset)
        if offsets:
            threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5)
            for idx, page_offset in enumerate(offsets):
                page_url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
                    str(cat), str(level3), 'true' if up else '', page_offset)
                # only the worker for the last known page looks for further pages
                next_page = 'True' if idx == (len(offsets) - 1) else 'False'
                threadPool.run(self.fetch, callback=None, url=page_url,
                               config=dict(get_next=next_page, offset=page_offset))
            pages = threadPool.killAllWorkers(None)
            for p in pages:
                if not p:
                    continue
                soup2 = BeautifulSoup(p.content)
                offset2 = int(p.config['offset'])
                response += self.parse_cat_top_keywords(soup2, offset2)
                if p.config['get_next'] != 'True':
                    continue
                offsets = self.get_cat_top_keywords_pages(soup2, offset2)
                if not offsets:
                    continue
                # recurse into the next batch of result pages
                response += self.cat_top_keywords(cat, level3, up, offset2, offsets)
        return response

    def get_cat_top_keywords_pages(self, soup, offset):
        aa = soup.find('div', attrs={'class': 'pagination'}).findAll('a')
        if not aa:
            return []
        # each page shows 30 keywords; only keep offsets beyond the current one
        return [30 * (int(a.string) - 1) for idx, a in enumerate(aa)
                if a.string and 30 * (int(a.string) - 1) > offset]

    def parse_cat_top_keywords(self, soup, offset=0):
        if not soup:
            return []
        if type(soup) == Response:
            # read the offset off the response's config before parsing its body
            offset = soup.config['offset']
            soup = BeautifulSoup(soup.content)
        ks = []
        for idx, tr in enumerate(soup.find('table', attrs={'class': 'textlist'}).find('tbody').findAll('tr')):
            name = tr.td.find('span', attrs={'class': 'title'}).a.string.encode('utf-8')
            focus = int(tr.td.find('span', attrs={'class': 'focus'}).em.string)
            grows = tr.td.findAll('span', attrs={'class': 'grow'})
            # a 'down' icon marks a negative trend
            multi = -1 if 'down' in grows[0].i['class'] else 1
            grow_percent = int(grows[0].em.string[:-1]) * multi
            multi = -1 if 'down' in grows[1].i['class'] else 1
            grow_pos = int(re.findall('\d+', grows[1].em.string)[0]) * multi
            ks.append({'pos': idx + offset + 1, 'name': name, 'focus': focus,
                       'grow_percent': grow_percent, 'grow_pos': grow_pos})
        return ks

    def get_cats(self):
        '''Get top keywords categories'''
        start_url = 'http://top.taobao.com/index.php?from=tbsy'
        rs = self.fetch(start_url)
        if not rs:
            return None
        soup = BeautifulSoup(rs.content)
        cats = [{'id': 'TR_%s' % li['id'].encode('utf-8').upper(),
                 'title': li.a.text.encode('utf-8')}
                for li in soup.find('div', id='nav').findAll('li')
                if li['id'] != 'index']
        threadPool = ThreadPool(len(cats) if len(cats) <= 5 else 5)
        for cat in cats:
            threadPool.run(self.get_cats_thread, callback=None, cat=cat)
        cats = threadPool.killAllWorkers(None)
        print 'got cats'
        return cats

    def get_cats_thread(self, cat):
        subcats = self.get_sub_cats('http://top.taobao.com/level2.php?cat=%s' % cat['id'], 'cat', 2)
        if len(subcats) == 1:
            cat['children'] = self.get_sub_cats_thread(subcats[0])
            return cat
        threadPool = ThreadPool(len(subcats) if len(subcats) <= 5 else 5)
        for sc in subcats:
            threadPool.run(self.get_sub_cats_thread, callback=None, sc=sc)
        cat['children'] = threadPool.killAllWorkers(None)
        return cat

    def get_sub_cats_thread(self, sc):
        return {'id': sc['id'], 'title': sc['title'],
                'children': self.get_sub_cats('http://top.taobao.com/level3.php?cat=%s' % sc['id'], 'level3', 3),
                'level': sc['level']}

    def get_sub_cats(self, url, param, level):
        rs = self.fetch(url)
        if not rs:
            # return an empty list so callers can safely take len()
            return []
        cats = []
        soup = BeautifulSoup(rs.content)
        div = soup.find('div', id='categories')
        if not div:
            return []
        for dd in div.findAll('dd'):
            # pull the category id out of the link's query string
            lc = dict([q.split('=') for q in urlparse(dd.a['href']).query.split('&')])[param]
            try:
                lc = int(lc)
            except ValueError:
                lc = lc.encode('utf-8')
            cats.append({'id': lc, 'title': dd.a.string.encode('utf-8'), 'level': level})
        return cats

    def fetch(self, url, params=None, method='get', config={}):
        # the config= kwarg and response.config attribute are requests 0.x features
        proxy = None
        if self.proxies:
            proxy = {'http': random.choice(self.proxies)}
            retry = 5
        else:
            retry = 0
        while retry >= 0:
            try:
                if method == 'get':
                    req = requests.get(url, params=params, proxies=proxy, config=config)
                else:
                    req = requests.post(url, data=params, proxies=proxy, config=config)
                return req
            except RequestException:
                retry -= 1
                continue
        return None

    def parse_float(self, string):
        # grab the first number (integer, decimal or scientific) in the string
        return float(re.findall(r'[+-]? *(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?', string)[0])
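
# A minimal usage sketch, not part of the original module: it assumes the
# Taobao endpoints above are still reachable and that utils.ThreadPool is
# importable. Run the file as a script to crawl one keyword and the
# top-keyword category tree.
if __name__ == '__main__':
    crawler = Crawler()  # e.g. Crawler(proxies=['127.0.0.1:8080'])
    results = crawler.search('手机')  # search for "mobile phone"
    if results:
        print results
        for item in results.lists[:3]:
            print item
    # fetch the category tree with per-category top keywords (slow: spawns
    # worker threads and crawls many pages)
    pprint(crawler.get_top_keywords())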