This project started out as a personal interest; the original intent was to give friends with no coding experience something that works out of the box.
Reading this article requires a little knowledge of Scrapy and PyQt. It only shares the outline of the approach for discussion; read the source code if you need more, and feel free to open an issue.
The base class encapsulates the methods the framework needs. The framework targets sites with three levels of pages (title - section - detail page), and the branch points between its internal methods are designed around the interaction flow.
The GUI passes parameters and starts the backend >> the spider begins work in the overridden start_requests >> after parse and the other response-handling methods, it suspends and waits for a selection.
The execution order is (1) parse -- frame_book --> (2) parse_section -- frame_section --> (3) yield item. The frame methods are explained below.
The pipeline handles the final download, renaming, etc. of each item; at that point the spider completes one life cycle, sends a finish signal, and hands control back to the GUI.
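To make the suspend-and-wait step concrete, here is a minimal sketch of how a response handler can block until the GUI supplies a choice. The step_Q message and the callback name are my assumptions, not the project's verbatim code:

```python
def parse(self, response):
    frame_book_results = self.frame_book(response)  # formats results and streams them to the GUI
    self.step_Q.put('parse')      # assumed protocol: tell the GUI which step we are at
    elect = self.current_Q.get()  # suspends here until the user clicks Next
    for title, title_url in self.elect_res(elect, frame_book_results):
        yield scrapy.Request(url=title_url, callback=self.parse_section)
```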
Below I walk through each Scrapy component, picking up the more interesting parts.
```python
class BaseComicSpider(scrapy.Spider):
    """Overrides start_requests"""
    step = 'loop'
    current_status = {}
    print_Q = None
    current_Q = None
    step_Q = None
    bar = None  # this and the variables above are all interaction signals
    total = 0   # item counter, explained in the pipeline section
    search_url_head = NotImplementedError('须要自定义搜索网址')  # a search URL must be defined
    mappings = {'': ''}  # mappings maps custom keywords to URLs
    # ……………………

    def parse(self, response):
        frame_book_results = self.frame_book(response)
        yield scrapy.Request(url=title_url, ………………)

    def frame_book(self, response) -> dict:
        raise NotImplementedError

    def elect_res(self, elect: list, frame_results: dict, **kw) -> list:
        # helper implementing (1) selection via elect and (2) display of the results
        # formatted by the frame methods ->
        # -> returns data shaped like [[elected_title1, title1_url], [title2, title2_url]……]
        pass
    # ……………………

    def close(self, reason):
        # ……… teardown of the pipeline, session, etc.
        self.print_Q.put('结束信号')  # end of the spider's life cycle
```
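The base class leaves elect_res as a stub; as a rough guess at what it does (assuming frame_results maps the displayed index to a [title, url] pair, and that 0 means "select all"; neither is confirmed by the post):

```python
def elect_res(self, elect: list, frame_results: dict, **kw) -> list:
    if 0 in elect:  # assumed convention: 0 selects everything
        return list(frame_results.values())
    return [frame_results[i] for i in elect if i in frame_results]
```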
An example run in the backend: a simple two-level site only needs the two frame methods overridden, against the extended base class BaseComicSpider2.
The frame methods locate the target elements, clean the data on the fly, and return it to the frontend for display.
```python
class ComicxxxSpider(BaseComicSpider2):
    name = 'comicxxx'
    allowed_domains = ['m.xxx.com']
    search_url_head = 'http://m.xxx.com/search/?keywords='
    mappings = {'更新': 'http://m.xxx.com/update/', '排名': 'http://m.xxx.com/rank/'}

    def frame_book(self, response):
        # ……………………
        title = target.xpath(title_xpath).get().strip()
        self.print_Q.put(example_b.format(str(x + 1), title))  # send a print signal to the frontend for streaming display

    def frame_section(self, response):
        pass  # same idea as above
```
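To show the whole shape of a frame method, here is a hedged sketch of a complete frame_book; the xpaths, the example_b template, and the dict layout are illustrative assumptions:

```python
def frame_book(self, response):
    frame_results = {}
    example_b = '<{}>: {}'  # display template (assumed)
    targets = response.xpath('//ul[@class="book-list"]/li')  # hypothetical xpath
    for x, target in enumerate(targets):
        title = target.xpath('.//a/@title').get().strip()
        url = response.urljoin(target.xpath('.//a/@href').get())
        self.print_Q.put(example_b.format(str(x + 1), title))  # stream the row to the frontend
        frame_results[x + 1] = [title, url]  # keyed by the number the user will type
    return frame_results
```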
The custom part of settings.py relates to deployment; it uses the utility toolkit's methods to read the config file and build these variables.
```python
IMAGES_STORE, log_path, PROXY_CUST, LOG_LEVEL = get_info()
os.makedirs(f'{log_path}', exist_ok=True)  # log output
LOG_FILE = f"{log_path}/scrapy.log"
SPECIAL = ['xxxxx']
```
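For reference, get_info (shown in utils.py below) pulls the save path out of a <...> pair and the log level out of a keyword match, so a hypothetical setting.txt could look like the sample string here; the surrounding wording is made up:

```python
import re

# hypothetical setting.txt content; only the <...> path and the level keyword matter
sample = """save path: <D:\\comic>
log level: DEBUG"""
print(re.search(r'<([\s\S]+)>', sample).group(1))           # D:\comic
print(re.search(r'(DEBUG|WARNING|ERROR)', sample).group(1))  # DEBUG
```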
I didn't know at first how to handle the progress bar; after scanning the Pipeline class source I found that the image_downloaded method came closest to what I needed.
```python
def file_path(self, request, response=None, info=None):
    """Called before an image is stored; the default name is the md5 of the url,
    changed here to a custom, ordered naming scheme."""
    title = sub(r'([|.:<>?*"\\/])', '-', request.item.get('title'))  # preprocess illegal characters
    section = sub(r'([|.:<>?*"\\/])', '-', request.item.get('section'))
    page = '第%s页.jpg' % request.item.get('page')
    spider = self.spiderinfo.spider  # the settings.py parameters are used here
    basepath = spider.settings.get('IMAGES_STORE')
    path = f"{basepath}\\特殊\\{title}" if spider.name in spider.settings.get(
        'SPECIAL') else f"{basepath}\\{title}\\{section}\\"
    os.makedirs(path, exist_ok=True)
    return os.path.join(path, page)

def image_downloaded(self, response, request, info):
    """Inherited ImagesPipeline method, called when an image (file) finishes
    downloading; the dynamic progress-bar display is driven from here."""
    self.now += 1  # (ComicPipeline) self.now is the count processed so far
    spider = self.spiderinfo.spider
    percent = int((self.now / spider.total) * 100)  # spider.total is the total number of items
    if percent > self.threshold:
        percent -= int((percent / self.threshold) * 100)  # damp the progress (algorithm needs work)
    spider.bar.put(int(percent))  # throw the percentage back to the GUI
    super(ComicPipeline, self).image_downloaded(response=response, request=request, info=info)
```
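As a quick illustration of the illegal-character preprocessing in file_path (the sample title is made up):

```python
from re import sub

# strip characters that Windows paths forbid, replacing each with '-'
title = sub(r'([|.:<>?*"\\/])', '-', 'Dr. Stone: Part?2')
print(title)  # Dr- Stone- Part-2
```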
The rest: Items and Middlewares have few points of note, so they are skipped.
Button logic: implemented as slot functions; internally, a certain amount of button disabling guides the user through the operations.
Windows and information
The main window's textbrowser streams the principal data; the other windows are inlined into the whole and skipped here.
The help button gives general instructions, and the status bar at the bottom uses setStatusTip to offer friendly hints during each operation.
The progress bar hooks into the pipeline's signal output (see the slot sketch below).
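The connects in the next excerpt reference textbrowser_load and processbar_load; hypothetical slot bodies consistent with those connections might look like this (the real project's code may differ):

```python
def textbrowser_load(self, text: str):
    self.textBrowser.append(text)       # stream spider output into the main window

def processbar_load(self, percent: int):
    self.progressBar.setValue(percent)  # driven by the pipeline's bar queue
```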
Excerpt: the slot function behind the Next button logic
```python
def next_schedule(self):
    def start_and_search():
        self.log.debug('===--→ -*- searching')
        self.next_btn.setText('Next')
        keyword = self.searchinput.text()[6:].strip()
        index = self.chooseBox.currentIndex()
        if self.nextclickCnt == 0:  # guard: coming back from the section step to the parse step must not restart these
            self.bThread = WorkThread(self)

            def crawl_btn(text):
                if len(text) > 5:
                    self.crawl_btn.setEnabled(self.step_recv() == 'parse section')
                    self.next_btn.setDisabled(self.crawl_btn.isEnabled())

            self.chooseinput.textChanged.connect(crawl_btn)
            self.p = Process(target=crawl_what,
                             args=(index, self.print_Q, self.bar, self.current_Q, self.step_Q))
            self.bThread.print_signal.connect(self.textbrowser_load)
            self.bThread.item_count_signal.connect(self.processbar_load)
            self.bThread.finishSignal.connect(self.crawl_end)
            self.p.start()
            self.bThread.start()
            self.log.info(f'-*-*- Background thread starting')
        self.chooseBox.setDisabled(True)
        self.params_send({'keyword': keyword})
        self.log.debug(f'website_index:[{index}], keyword [{keyword}] success ')

    def _next():
        self.log.debug('===--→ nexting')
        self.judge_retry()  # when not retrying, set retry=False first to unlock the spider's next step
        choose = judge_input(self.chooseinput.text()[5:].strip())
        if self.nextclickCnt == 1:
            self.book_choose = choose  # if 0 was chosen, the spider must return the number of books here
            self.book_num = len(self.book_choose)
            if self.book_num > 1:
                self.log.info('book_num > 1')
                self.textBrowser.append(self.warning_(f'警告!!多选书本时不要随意使用 retry<br>'))
        self.chooseinput.clear()
        # the choose logic is delegated to the schedules of the crawl, next and retry buttons
        self.params_send({'choose': choose})
        self.log.debug(f'send choose: {choose} success')

    self.retrybtn.setEnabled(True)
    if self.next_btn.text() != '搜索':
        _next()
    else:
        start_and_search()
    self.nextclickCnt += 1
    self.searchinput.setEnabled(False)
    self.chooseinput.setFocusPolicy(Qt.StrongFocus)
    self.step_recv()  # wrapper around handling self.step_Q
    self.log.debug(f"===--→ next_schedule end (now step: {self.step})\n")
```
The method that creates the backend crawler process, called by start_and_search() in the UI main-thread Next logic above. Running each crawl in its own process keeps the GUI responsive and also sidesteps the fact that Twisted's reactor cannot be restarted within one process.
```python
def crawl_what(index, print_Q, bar, current_Q, step_Q):
    spider_what = {1: 'comic1', 2: 'comic2', 3: 'comic3'}
    freeze_support()
    process = CrawlerProcess(get_project_settings())
    process.crawl(spider_what[index], print_Q=print_Q, bar=bar,
                  current_Q=current_Q, step_Q=step_Q)
    process.start()
    process.join()
    process.stop()
```
Separating the UI main thread from the worker thread (in the project code, the crawler-process creation could be folded in here as well).
```python
class WorkThread(QThread):
    item_count_signal = pyqtSignal(int)
    print_signal = pyqtSignal(str)
    finishSignal = pyqtSignal(str)
    active = True

    def __init__(self, gui):
        super(WorkThread, self).__init__()
        self.gui = gui

    def run(self):
        while self.active:
            self.msleep(8)
            if not self.gui.print_Q.empty():
                self.msleep(8)
                self.print_signal.emit(str(self.gui.print_Q.get()))  # relay spider text to the GUI
            if not self.gui.bar.empty():
                self.item_count_signal.emit(self.gui.bar.get())      # relay progress percentages
                self.msleep(10)
            if '完成任务' in self.gui.textBrowser.toPlainText():
                self.item_count_signal.emit(100)
                self.msleep(20)
                break
        if self.active:
            from ComicSpider.settings import IMAGES_STORE
            self.finishSignal.emit(IMAGES_STORE)
```
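The post never shows where print_Q, bar, current_Q, and step_Q come from; since they are shared between the GUI-side thread and the spider process, presumably they are multiprocessing queues created by the GUI, along these (unconfirmed) lines:

```python
from multiprocessing import Queue

class GUI:  # stand-in for the real main-window class
    def __init__(self):
        self.print_Q = Queue()    # spider -> textbrowser text
        self.bar = Queue()        # pipeline -> progress-bar percentages
        self.current_Q = Queue()  # GUI -> spider user selections
        self.step_Q = Queue()     # spider -> GUI step/state notifications
```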
Resource-handling utilities
The utility toolkit, utils.py
```python
def get_info():
    with open(f'./setting.txt', 'r', encoding='utf-8') as fp:
        text = fp.read()
    sv_path = re.search('<([\s\S]+)>', text).group(1)
    level = re.search('(DEBUG|WARNING|ERROR)', text).group(1)
    # ………………

def cLog(name, level='INFO', **kw) -> Logger:
    # likewise reads setting.txt
    level = re.search('(DEBUG|WARNING|ERROR)', text).group(1)

def judge_input(_input: str) -> list:  # this method feels pretty handy in practice
    """
    "6" return [6]        //  "1+3+5" return [1,3,5]
    "4-6" return [4,5,6]  //  "1+4-6" return [1,4,5,6]
    """
```
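The post elides judge_input's body; an implementation matching its docstring contract could be as simple as this sketch of mine (not the project's code):

```python
def judge_input(_input: str) -> list:
    result = []
    for part in _input.split('+'):  # '+' joins individual picks
        if '-' in part:             # 'a-b' expands to the inclusive range
            start, end = part.split('-')
            result.extend(range(int(start), int(end) + 1))
        else:
            result.append(int(part))
    return result

assert judge_input('1+4-6') == [1, 4, 5, 6]
```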
Deployment is really just packaging into an exe with pyinstaller.
Points to note with pyinstaller:
- datas: in each entry, the first element is the file's current location in the project and the second its location at run time; be wary of the ('.', '.') recipe passed around online, since used carelessly it makes the git repo's size balloon.
- debug and console: set both to True to ease debugging (this ties in with the module-import debugging mentioned above).

spec reference:
```python
# -*- mode: python -*-
block_cipher = None

a = Analysis(['crawl_go.py'],
             pathex=['D:\\xxxxxxxxxxxxxxxx\\ComicSpider'],
             binaries=[],
             datas=[('D:\python\Lib\site-packages\scrapy\mime.types', 'scrapy'),
                    ('D:\python\Lib\site-packages\scrapy\VERSION', 'scrapy'),
                    ('./ComicSpider', 'ComicSpider'),
                    ('./GUI', 'GUI'),
                    ('./gui.py', '.'),
                    ('./material_ct.py', '.'),
                    ('./utils.py', '.'), ],  # -*-
             hiddenimports=[],
             hookspath=[],
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher,
             noarchive=False)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          [],
          name='ComicSpider',
          debug=True,  # -*-
          bootloader_ignore_signals=False,
          strip=False,
          upx=True,
          runtime_tmpdir=None,
          console=True,  # -*-
          icon='exe.ico')
```
The directory tree after packaging
```
├── ComicSpider.exe
├── log
│   ├── GUI.log
│   └── scrapy.log
├── scrapy.cfg  # tested: bundling scrapy.cfg into the exe does not work, presumably something to do with the cache path; keeping it outside does no harm
├── setting.txt
```
Scrapy works quite well for this kind of single-machine interactive use; on the PyQt side I have only scratched the surface. The logic does feel inelegantly written, though; I should study how to apply the strategy pattern later.
You are all welcome to try out the project and share your thoughts.