絮叨两句:
博主是一名软件工程系的在校生,利用博客记录自己所学的知识,也希望能帮助到正在学习的同学们
人的一生中会遇到各种各样的困难和折磨,逃避是解决不了问题的,唯有以乐观的精神去迎接生活的挑战
少年易老学难成,一寸光阴不可轻。
最喜欢的一句话:今日事,今日毕
html
从Python爬虫到Spark预处理数据的真实需求[一]
从Python爬虫到Spark预处理数据的真实需求[二]
从Python爬虫到Spark预处理数据的真实需求[三]
从Python爬虫到Spark预处理数据的真实需求[四]
从Python爬虫到Spark预处理数据的真实需求[五]
这一章是用来对没有获取到的数据进行再次请求获取,并更新到 MySQL
提示:以下是本篇文章正文内容,下面案例可供参考
import requests
from fake_useragent import UserAgent
import pymysql
from bs4 import BeautifulSoup

# Session cookie captured from a logged-in JD browser session.
COOKIE = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'

# <li> label prefix -> column of `xxuan_car_jd_hhs_product` it fills.
# Order matters: the first matching prefix wins, as in the original if/elif chain.
PARAMETER_LABELS = {
    '商品名称:': 'name',
    '销售规格:': 'sales',
    '产品材质:': 'material',
    '产品类型:': 'type',
    '货号:': 'ArticleNumbera',
    '商品毛重:': 'GrossWeight',
}


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service."""
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']


def getHTML(url):
    """Download *url* through a pooled proxy and return the page HTML.

    Retries up to 100 times on request errors or non-200 status codes.
    Raises RuntimeError when every attempt fails (the original code hit an
    unbound-variable NameError in that case).
    """
    proxy = get_proxy()  # rotating proxy ip
    ua = UserAgent()
    headers = {"User-Agent": ua.random, 'Cookie': COOKIE}
    trytimes = 100  # retry budget
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": "https://{}".format(proxy)},
                                    timeout=1)
            # NOTE(review): 302 and other non-200 codes fall through and retry
            if response.status_code == 200:
                break
        except requests.RequestException:
            print(f'requests failed {i} time', '要获取的URL:', url)
    if response is None:
        raise RuntimeError(f'all {trytimes} requests failed for {url}')
    return response.text


def _sql_quote(value):
    """Escape single quotes so scraped text cannot break the UPDATE statement."""
    return str(value).replace("'", "''")


def getProduct(https_li_href, brand_name, product_sku, product_Price):
    """Scrape one product page and return the UPDATE statement for its row.

    The row in `xxuan_car_jd_hhs_product` is matched by skuid/brand/price/url.
    Empty fields are stored as the literal string 'NULL'.
    """
    sql = {'skuid': product_sku, 'name': '', 'brand': brand_name,
           'price': product_Price, 'url': https_li_href, 'commodity_Name': '',
           'image': '', 'sales': '', 'material': '', 'type': '',
           'ArticleNumbera': '', 'GrossWeight': ''}
    produc_soup = BeautifulSoup(getHTML(https_li_href), 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (keep the original "https:NULL" -> 'NULL' normalisation)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    imageURL = f"https:{spec_img['data-origin']}" if spec_img is not None else "https:NULL"
    sql['image'] = "NULL" if 'NULL' in imageURL else imageURL
    # specification list
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in PARAMETER_LABELS.items():
                if label in text:
                    # the original assigned a tuple ('NULL',) on a dead
                    # li.text-is-None branch; plain strings only here
                    sql[column] = text.replace(label, '')
                    break
    for key in sql:
        if len(str(sql[key])) == 0:
            sql[key] = 'NULL'
    # values are interpolated into the statement; _sql_quote guards against
    # quotes in scraped text (full parameterization would change the
    # string-returning contract used by connectMysql)
    assignments = ','.join(f"{k}='{_sql_quote(v)}'" for k, v in sql.items())
    return (f"UPDATE `xxuan_car_jd_hhs_product` SET {assignments}"
            f" WHERE skuid='{product_sku}' AND brand='{brand_name}'"
            f" AND price='{product_Price}' AND url='{https_li_href}';")


def connectMysql():
    """Re-scrape every row whose name is still 'NULL' and update it in place."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # commit after every statement, same effect as conn.commit()
    )
    cur = conn.cursor()
    cur.execute('select * from `xxuan_car_jd_hhs_product` where name ="NULL"')
    for row in cur.fetchall():
        # index the row tuple directly; the old str()/split(',') parsing broke
        # whenever any column value itself contained a comma
        sku = str(row[1]).strip()
        brand = str(row[3]).strip()
        price = str(row[4]).strip()
        href = str(row[5]).strip()
        db = getProduct(https_li_href=href, brand_name=brand,
                        product_sku=sku, product_Price=price)
        print(db)
        cur.execute(db)
        conn.commit()


if __name__ == '__main__':
    connectMysql()
import requests
from fake_useragent import UserAgent
import pymysql
from bs4 import BeautifulSoup

# Session cookie captured from a logged-in JD browser session.
COOKIE = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'

# <li> label prefix -> column of `xxuan_car_jd_mobil_product` it fills.
# Order matters: the first matching prefix wins, as in the original if/elif chain.
PARAMETER_LABELS = {
    '商品名称:': 'name',
    '商品编号:': 'skuid',
    '商品毛重:': 'netweight',
    '商品产地:': 'originplace',
    '粘度:': 'viscosity',
    '机油种类:': 'type',
    '容量:': 'volume',
}


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service."""
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']


def getHTML(url):
    """Download *url* through a pooled proxy and return the page HTML.

    Retries up to 1000 times on request errors or non-200 status codes.
    Raises RuntimeError when every attempt fails (the original code hit an
    unbound-variable NameError in that case).
    """
    proxy = get_proxy()  # rotating proxy ip
    ua = UserAgent()
    headers = {"User-Agent": ua.random, 'Cookie': COOKIE}
    trytimes = 1000  # retry budget
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": "https://{}".format(proxy)},
                                    timeout=1)
            # NOTE(review): 302 and other non-200 codes fall through and retry
            if response.status_code == 200:
                break
        except requests.RequestException:
            print(f'requests failed {i} time', '要获取的URL:', url)
    if response is None:
        raise RuntimeError(f'all {trytimes} requests failed for {url}')
    return response.text


def _sql_quote(value):
    """Escape single quotes so scraped text cannot break the UPDATE statement."""
    return str(value).replace("'", "''")


def getProduct(https_li_href, brand_name):
    """Scrape one engine-oil product page and return its UPDATE statement.

    The row in `xxuan_car_jd_mobil_product` is matched by url and brand.
    Empty fields are stored as the literal string 'NULL'.
    """
    sql = {'skuid': '', 'name': '', 'brand': brand_name, 'type': '',
           'url': https_li_href, 'originplace': '', 'netweight': '',
           'commodity_Name': '', 'image': '', 'viscosity': '', 'volume': ''}
    produc_soup = BeautifulSoup(getHTML(https_li_href), 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (keep the original "https:NULL" -> 'NULL' normalisation)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    imageURL = f"https:{spec_img['data-origin']}" if spec_img is not None else "https:NULL"
    sql['image'] = "NULL" if 'NULL' in imageURL else imageURL
    # specification list
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in PARAMETER_LABELS.items():
                if label in text:
                    # the original assigned a tuple ('NULL',) on a dead
                    # li.text-is-None branch; plain strings only here
                    sql[column] = text.replace(label, '')
                    break
    for key in sql:
        if len(str(sql[key])) == 0:
            sql[key] = 'NULL'
    # _sql_quote guards against quotes in scraped text breaking the statement
    assignments = ','.join(f"{k}='{_sql_quote(v)}'" for k, v in sql.items())
    return (f"UPDATE `xxuan_car_jd_mobil_product` SET {assignments}"
            f" WHERE url='{https_li_href}' AND brand='{brand_name}'")


def connectMysql():
    """Re-scrape every row whose skuid is still 'NULL' and update it in place."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # commit after every statement, same effect as conn.commit()
    )
    cur = conn.cursor()
    cur.execute('select * from `xxuan_car_jd_mobil_product` where skuid ="NULL"')
    for row in cur.fetchall():
        # index the row tuple directly; the old str()/split(',') parsing broke
        # whenever any column value itself contained a comma
        brand = str(row[3]).strip()
        href = str(row[6]).strip()
        db = getProduct(https_li_href=href, brand_name=brand)
        print(db)
        cur.execute(db)
        conn.commit()


if __name__ == '__main__':
    connectMysql()
import requests
from fake_useragent import UserAgent
import pymysql
from bs4 import BeautifulSoup

# Session cookie captured from a logged-in JD browser session.
COOKIE = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'

# <li> label prefix -> column of `xxuan_car_jd_lt_product` it fills.
# Order matters: the first matching prefix wins, as in the original if/elif chain.
PARAMETER_LABELS = {
    '商品名称:': 'name',
    '商品编号:': 'skuid',
    '商品毛重:': 'netweight',
    '商品产地:': 'originplace',
    '尺寸:': 'size',
    '胎面宽度:': 'width',
    '扁平比:': 'Flattening',
    '货号:': 'number',
    '花纹性能:': 'performance',
    '轮胎特性:': 'characteristics',
    '车型类别:': 'type',
}


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service."""
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']


def getHTML(url):
    """Download *url* through a pooled proxy and return the page HTML.

    Retries up to 1000 times on request errors or non-200 status codes.
    Raises RuntimeError when every attempt fails (the original code hit an
    unbound-variable NameError in that case).
    """
    proxy = get_proxy()  # rotating proxy ip
    ua = UserAgent()
    headers = {"User-Agent": ua.random, 'Cookie': COOKIE}
    trytimes = 1000  # retry budget
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": "https://{}".format(proxy)},
                                    timeout=1)
            # NOTE(review): 302 and other non-200 codes fall through and retry
            if response.status_code == 200:
                break
        except requests.RequestException:
            print(f'requests failed {i} time', '要获取的URL:', url)
    if response is None:
        raise RuntimeError(f'all {trytimes} requests failed for {url}')
    return response.text


def _sql_quote(value):
    """Escape single quotes so scraped text cannot break the UPDATE statement."""
    return str(value).replace("'", "''")


def getProduct(https_li_href, brand_name, price):
    """Scrape one tyre product page and return its UPDATE statement.

    The row in `xxuan_car_jd_lt_product` is matched by url and brand.
    Empty fields are stored as the literal string 'NULL'.
    """
    sql = {'skuid': '', 'name': '', 'brand': brand_name, 'url': https_li_href,
           'price': price, 'commodity_Name': '', 'image': '', 'netweight': '',
           'originplace': '', 'size': '', 'width': '', 'number': '',
           'performance': '', 'Flattening': '', 'characteristics': '',
           'type': ''}
    produc_soup = BeautifulSoup(getHTML(https_li_href), 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (keep the original "https:NULL" -> 'NULL' normalisation)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    imageURL = f"https:{spec_img['data-origin']}" if spec_img is not None else "https:NULL"
    sql['image'] = "NULL" if 'NULL' in imageURL else imageURL
    # specification list
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in PARAMETER_LABELS.items():
                if label in text:
                    # the original assigned a tuple ('NULL',) on a dead
                    # li.text-is-None branch; plain strings only here
                    sql[column] = text.replace(label, '')
                    break
    for key in sql:
        if len(str(sql[key])) == 0:
            sql[key] = 'NULL'
    # _sql_quote guards against quotes in scraped text breaking the statement
    assignments = ','.join(f"{k}='{_sql_quote(v)}'" for k, v in sql.items())
    return (f"UPDATE `xxuan_car_jd_lt_product` SET {assignments}"
            f" WHERE url='{https_li_href}' AND brand='{brand_name}'")


def connectMysql():
    """Re-scrape every row whose skuid is still 'NULL' and update it in place."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # commit after every statement, same effect as conn.commit()
    )
    cur = conn.cursor()
    cur.execute('select * from `xxuan_car_jd_lt_product` where skuid ="NULL"')
    for row in cur.fetchall():
        # index the row tuple directly; the old str()/split(',') parsing broke
        # whenever any column value itself contained a comma
        brand = str(row[3]).strip()
        href = str(row[4]).strip()
        price = str(row[5]).strip()
        db = getProduct(https_li_href=href, brand_name=brand, price=price)
        print(db)
        cur.execute(db)
        conn.commit()


if __name__ == '__main__':
    connectMysql()
import requests
from fake_useragent import UserAgent
import pymysql
from bs4 import BeautifulSoup

# Session cookie captured from a logged-in JD browser session.
COOKIE = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'

# <li> label prefix -> column of `xxuan_car_jd_scp_product` it fills.
# Order matters: '产品类别:' must be tested before the bare '类别:', and the
# first matching prefix wins, as in the original if/elif chain.
PARAMETER_LABELS = {
    '商品名称:': 'name',
    '商品编号:': 'skuid',
    '产品类别:': 'type',
    '包装规格:': 'package',
    '干湿沸点:': 'boiling',
    '货号:': 'ArticleNumber',
    '商品毛重:': 'GrossWeight',
    '商品产地:': 'CommodityOrigin',
    '产品工艺:': 'process',
    '安装位置:': 'Installation',
    '类别:': 'type',
    '材质:': 'texture',
}


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service."""
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']


def getHTML(url):
    """Download *url* through a pooled proxy and return the page HTML.

    Retries up to 100 times on request errors or non-200 status codes.
    Raises RuntimeError when every attempt fails (the original code hit an
    unbound-variable NameError in that case).
    """
    proxy = get_proxy()  # rotating proxy ip
    ua = UserAgent()
    headers = {"User-Agent": ua.random, 'Cookie': COOKIE}
    trytimes = 100  # retry budget
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": "https://{}".format(proxy)},
                                    timeout=1)
            # NOTE(review): 302 and other non-200 codes fall through and retry
            if response.status_code == 200:
                break
        except requests.RequestException:
            print(f'requests failed {i} time', '要获取的URL:', url)
    if response is None:
        raise RuntimeError(f'all {trytimes} requests failed for {url}')
    return response.text


def _sql_quote(value):
    """Escape single quotes so scraped text cannot break the UPDATE statement."""
    return str(value).replace("'", "''")


def getProduct(https_li_href, brand_name, product_Sku, product_Price):
    """Scrape one brake-fluid product page and return its UPDATE statement.

    The row in `xxuan_car_jd_scp_product` is matched by skuid/brand/price/url.
    Empty fields are stored as the literal string 'NULL'.
    """
    sql = {'skuid': product_Sku, 'name': '', 'brand': brand_name,
           'price': product_Price, 'url': https_li_href, 'commodity_Name': '',
           'image': '', 'Additivetype': '', 'TypesOfAdditives': '',
           'NetContent': '', 'ArticleNumber': '', 'boiling': '', 'package': '',
           'GrossWeight': '', 'CommodityOrigin': '', 'process': '',
           'Installation': '', 'type': '', 'texture': ''}
    produc_soup = BeautifulSoup(getHTML(https_li_href), 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (keep the original "https:NULL" -> 'NULL' normalisation)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    imageURL = f"https:{spec_img['data-origin']}" if spec_img is not None else "https:NULL"
    sql['image'] = "NULL" if 'NULL' in imageURL else imageURL
    # specification list
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in PARAMETER_LABELS.items():
                if label in text:
                    # the original assigned a tuple ('NULL',) on a dead
                    # li.text-is-None branch; plain strings only here
                    sql[column] = text.replace(label, '')
                    break
    for key in sql:
        if len(str(sql[key])) == 0:
            sql[key] = 'NULL'
    # _sql_quote guards against quotes in scraped text breaking the statement
    assignments = ','.join(f"{k}='{_sql_quote(v)}'" for k, v in sql.items())
    return (f"UPDATE `xxuan_car_jd_scp_product` SET {assignments}"
            f" WHERE skuid='{product_Sku}' AND brand='{brand_name}'"
            f" AND price='{product_Price}' AND url='{https_li_href}';")


def connectMysql():
    """Re-scrape every row whose name is still 'NULL' and update it in place."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # commit after every statement, same effect as conn.commit()
    )
    cur = conn.cursor()
    cur.execute('select * from `xxuan_car_jd_scp_product` where name ="NULL"')
    for row in cur.fetchall():
        # index the row tuple directly; the old str()/split(',') parsing broke
        # whenever any column value itself contained a comma
        sku = str(row[1]).strip()
        brand = str(row[3]).strip()
        price = str(row[4]).strip()
        href = str(row[5]).strip()
        print('sku:', sku, '----brand:', brand, '----href:', href, '----price:', price)
        db = getProduct(https_li_href=href, brand_name=brand,
                        product_Sku=sku, product_Price=price)
        print(db)
        cur.execute(db)
        conn.commit()


if __name__ == '__main__':
    connectMysql()
import requests
from fake_useragent import UserAgent
import pymysql
from bs4 import BeautifulSoup

# Session cookie captured from a logged-in JD browser session.
COOKIE = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'

# <li> label prefix -> column of `xxuan_car_jd_tjj_product` it fills.
# Order matters: the first matching prefix wins, as in the original if/elif chain.
PARAMETER_LABELS = {
    '商品名称:': 'name',
    '商品编号:': 'skuid',
    '添加剂类型:': 'Additivetype',
    '添加剂种类:': 'TypesOfAdditives',
    '净含量:': 'NetContent',
    '货号:': 'ArticleNumber',
    '商品毛重:': 'GrossWeight',
    '商品产地:': 'CommodityOrigin',
}


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service."""
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']


def getHTML(url):
    """Download *url* through a pooled proxy and return the page HTML.

    Retries up to 100 times on request errors or non-200 status codes.
    Raises RuntimeError when every attempt fails (the original code hit an
    unbound-variable NameError in that case).
    """
    proxy = get_proxy()  # rotating proxy ip
    ua = UserAgent()
    headers = {"User-Agent": ua.random, 'Cookie': COOKIE}
    trytimes = 100  # retry budget
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": "https://{}".format(proxy)},
                                    timeout=1)
            # NOTE(review): 302 and other non-200 codes fall through and retry
            if response.status_code == 200:
                break
        except requests.RequestException:
            print(f'requests failed {i} time', '要获取的URL:', url)
    if response is None:
        raise RuntimeError(f'all {trytimes} requests failed for {url}')
    return response.text


def _sql_quote(value):
    """Escape single quotes so scraped text cannot break the UPDATE statement."""
    return str(value).replace("'", "''")


def getProduct(https_li_href, brand_name, product_Price):
    """Scrape one fuel-additive product page and return its UPDATE statement.

    The row in `xxuan_car_jd_tjj_product` is matched by brand/price/url.
    Empty fields are stored as the literal string 'NULL'.
    """
    sql = {'skuid': '', 'name': '', 'brand': brand_name,
           'price': product_Price, 'url': https_li_href, 'commodity_Name': '',
           'image': '', 'Additivetype': '', 'TypesOfAdditives': '',
           'NetContent': '', 'ArticleNumber': '', 'GrossWeight': '',
           'CommodityOrigin': ''}
    produc_soup = BeautifulSoup(getHTML(https_li_href), 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (keep the original "https:NULL" -> 'NULL' normalisation)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    imageURL = f"https:{spec_img['data-origin']}" if spec_img is not None else "https:NULL"
    sql['image'] = "NULL" if 'NULL' in imageURL else imageURL
    # specification list
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in PARAMETER_LABELS.items():
                if label in text:
                    # the original assigned a tuple ('NULL',) on a dead
                    # li.text-is-None branch; plain strings only here
                    sql[column] = text.replace(label, '')
                    break
    for key in sql:
        if len(str(sql[key])) == 0:
            sql[key] = 'NULL'
    # _sql_quote guards against quotes in scraped text breaking the statement
    assignments = ','.join(f"{k}='{_sql_quote(v)}'" for k, v in sql.items())
    return (f"UPDATE `xxuan_car_jd_tjj_product` SET {assignments}"
            f" WHERE brand='{brand_name}' AND price='{product_Price}'"
            f" AND url='{https_li_href}';")


def connectMysql():
    """Re-scrape every row whose skuid is still 'NULL' and update it in place."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # commit after every statement, same effect as conn.commit()
    )
    cur = conn.cursor()
    cur.execute('select * from `xxuan_car_jd_tjj_product` where skuid ="NULL"')
    for row in cur.fetchall():
        # index the row tuple directly; the old str()/split(',') parsing broke
        # whenever any column value itself contained a comma
        brand = str(row[3]).strip()
        price = str(row[4]).strip()
        href = str(row[5]).strip()
        db = getProduct(https_li_href=href, brand_name=brand,
                        product_Price=price)
        print(db)
        cur.execute(db)
        conn.commit()


if __name__ == '__main__':
    connectMysql()
import requests
from fake_useragent import UserAgent
import pymysql
from bs4 import BeautifulSoup

# Session cookie captured from a logged-in JD browser session.
COOKIE = '__jdu=577937999; areaId=15; ipLoc-djd=15-1213-3410-0; PCSYCityID=CN_330000_330100_330105; shshshfpa=bbe2e678-8333-005c-d01a-b070738f7860-1597809413; shshshfpb=pqSL0Bsl%2FLma%20U3QU6OB1xw%3D%3D; mt_xid=V2_52007VwcUVFVaVFIXQSldVWJWFwVVX05cGx0eQAAyVhRODQhWWQNJH1gEY1QWBwhcWwovShhfBHsCG05eWUNaG0IcVA5mACJQbVhiUh9IGV4MYgMbU1xfV14eQR1bAVcDFFZZ; user-key=68c44d85-8cac-4072-8369-c117f62d8eb3; cn=0; unpl=V2_ZzNtbURfFhZwXEEAKx4OVWJTElsSUUoUdQsRAHkbWgFmCkEKclRCFnQUR11nGl0UZwQZWEVcQxxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZH8fWg1lBRpVSmdzEkU4dlN7EFQGZDMTbUNnAUEpCk5Weh5YSGMFFFVAUUsdfThHZHg%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_79d24e6ea6ca4a17a78012fe337508bf|1597913171572; __jda=122270672.577937999.1597809411.1597906781.1597909689.7; __jdc=122270672; 3AB9D23F7A4B3C9B=3SJYGJUIMVOSXHMAT54Z7M54MSN7POALYPRYHXXL4OTIUAYWVYTBG6AFPA4L4Q5ED37GELWUAZFAMTA6KV6JQSFCHA; shshshfp=b65beee4eac3565989e568b588a5f619; shshshsID=1d65ff1a1d5a2ecc10479b3f2d1ce72b_39_1597913210747; __jdb=122270672.48.577937999|7.1597909689'

# <li> label prefix -> column of `xxuan_car_jd_ycj_product` it fills.
# Order matters: the first matching prefix wins, as in the original if/elif
# chain. '商品编号:' maps to None: the original deliberately ignored the
# on-page sku (both branches were `pass`) because skuid comes from the DB row.
PARAMETER_LABELS = {
    '商品名称:': 'name',
    '商品编号:': None,
    '商品毛重:': 'netweight',
    '商品产地:': 'originplace',
    '产品类别:': 'category',
    '冰点:': 'freezing',
    '包装规格:': 'package',
    '干湿沸点:': 'boiling',
    '销售规格:': 'sales',
    '安装位置:': 'installation',
    '变速箱类型:': 'transmission',
}


def get_proxy():
    """Fetch one proxy address from the local proxy-pool service."""
    return requests.get('http://xxxxxxxxxxx/get/').json()['proxy']


def getHTML(url):
    """Download *url* through a pooled proxy and return the page HTML.

    Retries up to 100 times on request errors or non-200 status codes.
    Raises RuntimeError when every attempt fails (the original code hit an
    unbound-variable NameError in that case).
    """
    proxy = get_proxy()  # rotating proxy ip
    ua = UserAgent()
    headers = {"User-Agent": ua.random, 'Cookie': COOKIE}
    trytimes = 100  # retry budget
    response = None
    for i in range(trytimes):
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": "https://{}".format(proxy)},
                                    timeout=1)
            # NOTE(review): 302 and other non-200 codes fall through and retry
            if response.status_code == 200:
                break
        except requests.RequestException:
            print(f'requests failed {i} time', '要获取的URL:', url)
    if response is None:
        raise RuntimeError(f'all {trytimes} requests failed for {url}')
    return response.text


def _sql_quote(value):
    """Escape single quotes so scraped text cannot break the UPDATE statement."""
    return str(value).replace("'", "''")


def getProduct(https_li_href, brand_name, product_sku, product_Price):
    """Scrape one coolant product page and return its UPDATE statement.

    The row in `xxuan_car_jd_ycj_product` is matched by skuid/brand/price/url.
    Empty fields are stored as the literal string 'NULL'.
    """
    sql = {'skuid': product_sku, 'name': '', 'brand': brand_name,
           'freezing': '', 'url': https_li_href, 'originplace': '',
           'netweight': '', 'price': product_Price, 'commodity_Name': '',
           'image': '', 'category': '', 'package': '', 'boiling': '',
           'sales': '', 'installation': '', 'transmission': ''}
    produc_soup = BeautifulSoup(getHTML(https_li_href), 'html.parser')
    # product title
    sku_name_wrap = produc_soup.find('div', attrs={'class': 'itemInfo-wrap'})
    if sku_name_wrap is not None:
        sku_name = sku_name_wrap.find('div', attrs={'class': 'sku-name'})
        if sku_name is not None:
            sql['commodity_Name'] = str(sku_name.text).strip()
    # product image (keep the original "https:NULL" -> 'NULL' normalisation)
    spec_img = produc_soup.find('img', attrs={'id': 'spec-img'})
    imageURL = f"https:{spec_img['data-origin']}" if spec_img is not None else "https:NULL"
    sql['image'] = "NULL" if 'NULL' in imageURL else imageURL
    # specification list
    parameter_list = produc_soup.find('ul', attrs={'class': 'parameter2 p-parameter-list'})
    if parameter_list is not None:
        for li in parameter_list.findAll('li'):
            text = str(li.text)
            for label, column in PARAMETER_LABELS.items():
                if label in text:
                    if column is not None:
                        # the original assigned a tuple ('NULL',) on a dead
                        # li.text-is-None branch; plain strings only here
                        sql[column] = text.replace(label, '')
                    break
    print(sql)
    for key in sql:
        if len(str(sql[key])) == 0:
            sql[key] = 'NULL'
    # _sql_quote guards against quotes in scraped text breaking the statement
    assignments = ','.join(f"{k}='{_sql_quote(v)}'" for k, v in sql.items())
    return (f"UPDATE `xxuan_car_jd_ycj_product` SET {assignments}"
            f" WHERE skuid='{product_sku}' AND brand='{brand_name}'"
            f" AND price='{product_Price}' AND url='{https_li_href}';")


def connectMysql():
    """Re-scrape every row whose name is still 'NULL' and update it in place."""
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='jd_qipei',
        charset='utf8',
        autocommit=True,  # commit after every statement, same effect as conn.commit()
    )
    cur = conn.cursor()
    cur.execute('select * from `xxuan_car_jd_ycj_product` where name ="NULL"')
    for row in cur.fetchall():
        # index the row tuple directly; the old str()/split(',') parsing broke
        # whenever any column value itself contained a comma
        sku = str(row[1]).strip()
        brand = str(row[3]).strip()
        href = str(row[5]).strip()
        price = str(row[8]).strip()
        db = getProduct(https_li_href=href, brand_name=brand,
                        product_sku=sku, product_Price=price)
        cur.execute(db)
        conn.commit()


if __name__ == '__main__':
    connectMysql()
希望能帮助到大家,谢谢