# -*- coding: utf-8 -*-
from lxml import etree
from multiprocessing.dummy import Pool as threadpool
import requests
import json
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


def towrite(item):
    f.writelines(u'回帖人: ' + str(item['user_name']) + '\n')
    f.writelines(u'回帖内容: ' + str(item['topic_reply_content']) + '\n')
    f.writelines(u'回帖时间: ' + str(item['topic_reply_time']) + '\n\n')


def spider(url):
    html = requests.get(url)                 # fetch the page source with requests
    selector = etree.HTML(html.text)         # parse it so XPath can be applied
    # XPath first grabs the outer block for each reply ("grab big, then small")
    content_field = selector.xpath('//div[@class="l_post j_l_post l_post_bright "]')
    item = {}                                # scraped fields are collected in a dict
    for each in content_field:               # now parse each reply block in turn
        print each.xpath('@data-field')[0]               # the data-field attribute holds JSON
        print json.loads(each.xpath('@data-field')[0])   # decoded into a dict
        # decode the data-field JSON into a dict of reply metadata
        reply_info = json.loads(each.xpath('@data-field')[0])
        author = reply_info['author']['user_name']       # read fields from the dict
        reply_time = reply_info['content']['date']
        # extract the reply text with XPath and strip spaces
        content = each.xpath('div[@class="d_post_content_main"]/div/cc/div/text()')[0].replace(' ', '')
        print content
        print reply_time
        print author
        item['user_name'] = author           # store the fields in the item dict
        item['topic_reply_content'] = content
        item['topic_reply_time'] = reply_time
        towrite(item)                        # write the record to the file


if __name__ == '__main__':
    pool = threadpool(1)                     # thread pool for crawling
    f = open('content.txt', 'a')
    page = []
    for i in range(1, 2):                    # pages to crawl, collected into a list
        newpage = 'http://tieba.baidu.com/p/3522395718?pn=' + str(i)
        page.append(newpage)
    results = pool.map(spider, page)         # map the spider over the page URLs
    pool.close()
    f.close()
A crawler for a Baidu Tieba thread that scrapes only the reply content from the page HTML.
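The key trick is that each reply div carries a data-field attribute whose value is JSON, which json.loads turns into a nested dict. A minimal sketch of that decoding step, using a hypothetical attribute value that only contains the two keys the spider actually reads (author.user_name and content.date):

# -*- coding: utf-8 -*-
import json

# Hypothetical data-field value; only the keys the spider reads are shown.
sample_data_field = '{"author": {"user_name": "example_user"}, "content": {"date": "2015-01-20 20:00"}}'

reply_info = json.loads(sample_data_field)   # attribute string -> nested dict
print(reply_info['author']['user_name'])     # example_user
print(reply_info['content']['date'])         # 2015-01-20 20:00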