转自:https://blog.csdn.net/qq_36381299/article/details/80634451
前言:
根据搜索相关的职位,获取职位数量,由职位数量获得职位相关页码链接,再由相关页码链接得到每一个职位链接,最后由职位链接获取详细的职位描述。以上获取链接和职位描述均由正则表达式完成。
环境:win7、pycharm、python2
所用到的库:urllib2、re、urllib、time、jieba、matplotlib、wordcloud、numpy、PIL
文件组成:
main.py ----主要函数文件,包括获取页码链接、获取每页职位链接、获取职位描述、将爬取到的职位描述保存为txt文本
zhaopin_wordcloud.py ----根据保存的文本信息生成词云
msyh.ttf ----为生成词云准备的字体文件
info.txt ----保存职位描述的txt文本
代码如下:
main.py
#coding:utf-8
import urllib2
import urllib import re import time #获取页码连接 def getpagelist(name): url = "https://sou.zhaopin.com/jobs/searchresult.ashx?" # 模拟浏览器头部 headers = { "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1" } word = {"kw": name} # 相关职位 word = urllib.urlencode(word) # 编码成字符串 url = url + word # 拼接url request = urllib2.Request(url, headers=headers) # 发起请求 request.add_header("Connection", "keep-alive") # 一直活着 response = urllib2.urlopen(request) # 打开请求 data = response.read() # 读取数据 restr = "<em>(\\d+)</em>" # 正则表达式 regex = re.compile(restr, re.IGNORECASE) mylist = regex.findall(data) # 寻找页面全部信息 numbers = mylist[0] numbers = eval(numbers)#将职位数转化为数据 zhao_numbers = numbers # 职位的数量 zhao_list = [] # 空列表 print "++++++++++++++++" print zhao_numbers # for i in range(zhao_numbers//50): # print if zhao_numbers % 60 == 0: # 生成页面列表 for i in range(zhao_numbers // 60): #智联招聘每页有60个职位 职位总数整除60就是有几页 zhao_list.append( #添加连接到列表 "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B5%8E%E5%8D%97&kw=" + name + "&p=" + str(i + 1)) else: for i in range(zhao_numbers // 60 + 1): zhao_list.append( "http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B5%8E%E5%8D%97&kw=" + name + "&p=" + str(i + 1)) return zhao_list #获取每页中的职位连接 def get_url_list(url): #模拟浏览器 headers = { "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1" } request = urllib2.Request(url, headers=headers) # 发起请求 request.add_header("Connection", "keep-alive") # 一直活着 response = urllib2.urlopen(request) # 打开请求 data = response.read() # 读取数据 #print data restr = ur"<a style=\"font-weight: bold\" par=\"ssidkey=y&ss=201&ff=03&sg=.*?;so=.*?\" href=\"(\bhttp[\s\S]..\bjobs.\w+.\w+.\w+.\w+)" # 正则表达式,()匹配内容 regex = re.compile(restr, re.IGNORECASE) mylist = regex.findall(data) # 寻找页面全部信息 urllist = [] for list in mylist: urllist.append(list) return urllist '''#错误的代码 restr = "http://jobs.zhaopin.com/([\s\S]*?)" # 正则表达式 regex = re.compile(restr, re.IGNORECASE) tableurllist = regex.findall(tablestr) # 
寻找页面全部信息 urllist = [] for list in tableurllist: urllist.append("http://jobs.zhaopin.com/"+list+".htm") return urllist ''' #获取职位描述信息 def get_zhiwei(url): # 模拟浏览器 headers = { "User-Agent": "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/60.0.1" } request = urllib2.Request(url, headers=headers) # 发起请求 request.add_header("Connection", "keep-alive") # 一直活着 response = urllib2.urlopen(request) # 打开请求 data = response.read() # 读取数据 restr = "<div class=\"tab-inner-cont\">([\s\S]*?)<b>" # 正则表达式 regex = re.compile(restr, re.IGNORECASE) info = regex.findall(data) info_page = info[0].decode('utf-8').strip().replace("<p>", "").replace("</p>", "")#去除标签替换为空格 #info_page = info[0].decode('utf-8') return info_page #写入文件 def wirtetxt(info): file = open('info.txt',"ab+") file.write((info).encode('utf-8')) file.close() #file.write((info).encode('utf-8')) #num=1 #savefilepath = "workinfo.txt" #savefile =open(savefilepath,"wb") zhao_list = getpagelist("python")#页码连接 for line in zhao_list: #print line # 打印连接 urllist= get_url_list(line) for line1 in urllist: #num+=1 #print line1 #print "共有%d"%num+"个连接" time.sleep(1) workstr= get_zhiwei(line1) print workstr print "正在写入...." wirtetxt(workstr) #savefile.write((workstr).encode("utf-8")) #savefile.close()
zhaopin_wordcloud.py
#coding:utf-8
import jieba
import matplotlib import matplotlib.pyplot as plt import wordcloud from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS#词云 import numpy as np #科学计算 from PIL import Image #处理图片 #打开文本 testfile = open("info.txt").read() #数据清洗,去除不重要词语 testfile = testfile.\ replace("family","").replace("span","").replace("font","").replace("color","").replace("14px","").replace("rgb","").\ replace("size","").replace("br","").replace("0px","").replace("宋体","").replace("margin","").replace("line","").\ replace("style","").replace("line","").replace("height","").replace("white","").\ replace("熟悉","").replace("nbsp","").replace("background","").replace("normal","").replace("margin","").replace("平台","").\ replace("space","").replace("padding","").replace("bottom","").replace("top","").\ replace("技术","").replace("工做","").replace("text","").replace("indent","").replace("letter","").replace("stretch","").\ replace("25px","").replace("应用","").replace("simsun","").replace("strong","").\ replace("系统","").replace("Yahei","").replace("indent","").replace("left","").replace("data","").replace("熟练","").\ replace("Calii","").replace("Microsoft","").replace("Sans","").replace("div","").replace("serif","").replace("19px","").\ replace("设计","").replace("公司","").replace("开发","").replace("了解","").\ replace("熟悉","").replace("进行","").replace("仿宋","").replace("负责","").replace("border","").replace("专业","").\ replace("space","").replace("padding","").replace("优先","").replace("top","").\ replace("技术","").replace("工做","").replace("研发","").replace("要求","").replace("任职","").replace("相关","").\ replace("岗位职责","").replace("计算","").replace("上学","").replace("学历","") wordlist = jieba.cut(testfile,cut_all=True)#切割 space_list =" ".join(wordlist)#连接词语 backgroud = np.array(Image.open("1.jpg"))#背景图片 mywordcloud = WordCloud(background_color="white",#背景颜色 mask=backgroud,#写字用的背景图,从背景提取颜色 stopwords=STOPWORDS,#中止的默认词语 font_path="msyh.ttf",#字体 max_font_size=100,#字体大小 random_state=30,#词云数量 scale=1).generate(space_list)#生成词云 
image_color = ImageColorGenerator(backgroud)#生成词云的颜色 plt.imshow(mywordcloud)#显示词云 plt.axis("off") plt.show()
main.py 运行效果:
zhaopin_wordcloud.py 运行生成词云图片: