# -*- coding: utf-8 -*-
"""Estimate the crawl time for one dianping.com category: for every city,
fetch that city's category search page, read the shop counts shown in the
span.Color7 elements, and convert the total into hours/days."""
__author__ = 'dell'

from lxml import etree
import urllib2
import time


def loadCategory():
    """Load category name -> category id from catetory.txt (GBK, tab-separated)."""
    res = {}
    f_txt = open('catetory.txt')
    while True:
        line = f_txt.readline()
        if not line:
            break
        line = line.strip().decode('gbk')
        tokens = line.split('\t')
        if len(tokens) < 2:
            continue
        key = tokens[1].strip()
        print key
        val = tokens[0].strip()
        res[key] = val
    f_txt.close()
    return res


def loadCity():
    """Load city name -> city id from city.txt (GBK, colon-separated)."""
    res = {}
    f_txt = open('city.txt')
    while True:
        line = f_txt.readline()
        if not line:
            break
        line = line.strip().decode('gbk')
        tokens = line.split(':')
        if len(tokens) < 2:
            continue
        key = tokens[0].strip()
        val = tokens[1].strip()
        if key in res:
            print 'repeated city:', key
        else:
            res[key] = val
    f_txt.close()
    return res


cats = loadCategory()
# for key in cats.keys():
#     print key, cats[key]
citys = loadCity()
# for key in citys.keys():
#     print key, citys[key]
print 'number of categories:', len(cats)
print 'number of cities:', len(citys)
print 'generating urls ... ...'

standard = 'http://www.dianping.com/search/category/%s/%s'


def gen(cateName):
    """Build (url, cityName) pairs for the given category across all cities."""
    res = []
    if cateName in cats:
        catId = cats[cateName]
        for cityName in citys.keys():
            cityId = citys[cityName]
            url = standard % (cityId, catId)
            res.append((url, cityName))
    return res


def getHtml(url):
    """Fetch a page with a browser-like User-Agent and return it as unicode."""
    request = urllib2.Request(url)
    request.add_header('User-Agent',
                       'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    doc = urllib2.urlopen(request, timeout=45).read().decode('utf8')
    return doc


def getFetchHour(count):
    """Estimated crawl time in hours, assuming 5 seconds per shop."""
    return count * 5.0 / 3600


def getFetchDay(count):
    """Estimated crawl time in days, assuming 5 seconds per shop."""
    return count * 5.0 / 3600 / 24


urllist = gen(u'购物')
print len(urllist)

total = 0
for u in urllist:
    html = getHtml(u[0])
    tree = etree.HTML(html)
    # Each span.Color7 on the search page holds a shop count such as "(1234)".
    hnc = tree.xpath("//span[@class='Color7']")
    for hn in hnc:
        strnum = hn.text.replace('(', '').replace(')', '')
        print u[1], strnum
        total += int(strnum)
    # time.sleep(5)

print total
print 'fetch time (hour) :' + str(getFetchHour(total))
print 'fetch time (day) :' + str(getFetchDay(total))
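
# ---------------------------------------------------------------------------
# Notes on the input files, inferred from the parsing code above; the concrete
# ids and names below are hypothetical placeholders, not data shipped with
# this script:
#
#   catetory.txt -- GBK-encoded, one category per line as "<catId>\t<name>",
#                   e.g. "g110\t购物" (filename spelling matches the open() call)
#   city.txt     -- GBK-encoded, one city per line as "<cityName>:<cityId>",
#                   e.g. "上海:1"
#
# The fetch-time helpers (getFetchHour / getFetchDay) simply assume 5 seconds
# per counted shop (count * 5.0 seconds), so the printed figures are rough
# upper-bound estimates rather than measured crawl times.
# ---------------------------------------------------------------------------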