博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Python 编码问题
阅读量:6695 次
发布时间:2019-06-25

本文共 2278 字,大约阅读时间需要 7 分钟。

# -*- coding: utf-8 -*-
"""Estimate how long crawling one dianping.com category would take.

The script reads a category table and a city table (both GBK-encoded text
files), builds one search URL per city for a chosen category, downloads each
page, adds up the counts shown in the ``span.Color7`` elements and converts
the total into an estimated crawl time at 5 seconds per item.

The code runs unchanged on Python 2.6+ and Python 3: ``print`` is always
called with a single argument and ``urllib2`` falls back to
``urllib.request``.
"""
import io
import time  # only needed if the politeness delay in main() is re-enabled
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3 exposes the same Request/urlopen API here
    import urllib.request as urllib2

__author__ = 'dell'

# URL template: first slot is the city id, second slot is the category id.
standard = 'http://www.dianping.com/search/category/%s/%s'

# Lookup tables used by gen() when no explicit tables are passed.
# They start empty and are filled in by main().
cats = {}
citys = {}


def loadCategory(path='catetory.txt'):
    """Load the category table and return ``{category name: category id}``.

    Each useful line has the form ``<id>\\t<name>``; lines with fewer than two
    tab-separated fields are skipped. Every name is echoed to stdout as it is
    read. When a name occurs more than once the last id wins.

    :param path: GBK-encoded input file (default keeps the original,
        misspelled, file name ``catetory.txt``).
    :raises IOError: if the file cannot be opened.
    :raises UnicodeDecodeError: if the file is not valid GBK.
    """
    res = {}
    # io.open decodes while reading on both Python 2 and 3, and the
    # with-block guarantees the handle is closed.
    with io.open(path, encoding='gbk') as f_txt:
        for line in f_txt:
            tokens = line.strip().split('\t')
            if len(tokens) < 2:
                continue
            key = tokens[1].strip()
            print(key)
            res[key] = tokens[0].strip()
    return res


def loadCity(path='city.txt'):
    """Load the city table and return ``{city name: city id}``.

    Each useful line has the form ``<name>:<id>``; lines without a colon are
    skipped. A repeated city name is reported on stdout and the first id seen
    is kept.

    :param path: GBK-encoded input file.
    :raises IOError: if the file cannot be opened.
    :raises UnicodeDecodeError: if the file is not valid GBK.
    """
    res = {}
    with io.open(path, encoding='gbk') as f_txt:
        for line in f_txt:
            tokens = line.strip().split(':')
            if len(tokens) < 2:
                continue
            key = tokens[0].strip()
            val = tokens[1].strip()
            if key in res:
                print('repeated city: %s' % key)
            else:
                res[key] = val
    return res


def gen(cateName, categories=None, cities=None):
    """Build the search URLs of one category for every known city.

    :param cateName: category name to look up.
    :param categories: ``{name: id}`` table; defaults to the module-level
        ``cats``.
    :param cities: ``{name: id}`` table; defaults to the module-level
        ``citys``.
    :return: list of ``(url, city name)`` tuples, empty when the category is
        unknown.
    """
    if categories is None:
        categories = cats
    if cities is None:
        cities = citys
    if cateName not in categories:
        return []
    catId = categories[cateName]
    return [(standard % (cityId, catId), cityName)
            for cityName, cityId in cities.items()]


def getHtml(url):
    """Download ``url`` with a browser User-Agent and return it as text.

    :raises urllib2.URLError: on network failure or timeout (45 seconds).
    :raises UnicodeDecodeError: if the body is not valid UTF-8.
    """
    request = urllib2.Request(url)
    request.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0')
    # closing() releases the connection even if read()/decode() raises; the
    # Python 2 response object is not a context manager by itself.
    with closing(urllib2.urlopen(request, timeout=45)) as response:
        return response.read().decode('utf8')


def getFetchHour(count):
    """Return the hours needed to fetch ``count`` items at 5 seconds each."""
    return count * 5.0 / 3600


def getFetchDay(count):
    """Return the days needed to fetch ``count`` items at 5 seconds each."""
    return getFetchHour(count) / 24


def parseCount(text):
    """Turn a counter label such as ``'(123)'`` into an ``int``.

    :param text: text of a counter element, possibly ``None``.
    :return: the number, or ``None`` when there is no text or nothing is left
        once the parentheses are removed.
    :raises ValueError: if the remaining text is not an integer.
    """
    if text is None:
        return None
    digits = text.replace('(', '').replace(')', '').strip()
    if not digits:
        return None
    return int(digits)


def main():
    """Load the tables, crawl the category pages and print the estimate."""
    # Imported here so the helpers above stay usable without lxml installed.
    from lxml import etree

    cats.update(loadCategory())
    citys.update(loadCity())
    print('length of category: %d' % len(cats))
    print('length of citys: %d' % len(citys))
    print('generating urls ... ...')

    urllist = gen(u'购物')
    print(len(urllist))

    total = 0  # named 'total' so the builtin sum() is not shadowed
    for url, cityName in urllist:
        tree = etree.HTML(getHtml(url))
        for hn in tree.xpath("//span[@class='Color7']"):
            count = parseCount(hn.text)
            if count is None:
                # An empty span carries no count; skip it instead of crashing.
                continue
            print('%s %d' % (cityName, count))
            total += count
        # NOTE: the 5 second delay between requests that the estimate assumes
        # is disabled here; re-enable with time.sleep(5) for a polite crawl.

    print(total)
    print('fetch time (hour) :' + str(getFetchHour(total)))
    print('fetch time (day) :' + str(getFetchDay(total)))


if __name__ == '__main__':
    main()

 

转载地址:http://zlpoo.baihongyu.com/

你可能感兴趣的文章
SpringMVC视图解析器概述
查看>>
SQL Server 自动化运维系列 - 监控磁盘剩余空间及SQL Server错误日志(Power Shell)...
查看>>
English Phonetic Spelling Alphabet
查看>>
linux下访问ftp服务器和文件传输
查看>>
极力推荐python初学者使用wingIDE
查看>>
[NOIP2014] 普及组
查看>>
HDU 1017 A Mathematical Curiosity【水,坑】
查看>>
ASP.NET JsonHelper类
查看>>
加密算法整理概述
查看>>
gridlayout小例子
查看>>
微信小程序 + Bmob后端云
查看>>
EntityFramework之原始查询如何查询未映射的值,你又知道多少?
查看>>
怎么使用T-sql生成两位字母
查看>>
自己动手编译octave 4.0.0
查看>>
【Java自学笔记系列:数组】
查看>>
Django之url路由
查看>>
二叉树的所有路径
查看>>
java反射机制的原理与简单使用
查看>>
CentOs
查看>>
Dockerfile编写(备份)
查看>>