# 安装虚拟环境
>> pip install virtualenvwrapper-win
# 新建虚拟环境
>> mkvirtualenv dirname
# 退出虚拟环境
>> deactivate
# 查看当前全部的虚拟环境
>> workon
# 进入某个虚拟环境
>> workon dirname
# 设置虚拟环境默认存放位置
>> 在环境变量中新建一个变量:WORKON_HOME,指定路径便可
复制代码
个人当前环境默认为python3,因此新建python3的虚拟环境不用指明python的路径
# 新建python3虚拟环境
>> mkvirtualenv dirname
复制代码
# 安装scrapy
>> pip3 install scrapy
# 安装pymysql
>> pip3 install pymysql
# 安装pymongo
>> pip3 install pymongo
复制代码
安装Scrapy时报错 Failed building wheel for Twisted
点这里,去下载相应版本的twisted安装便可
>> pip install Twisted-18.9.0-cp37-cp37m-win_amd64.whl
复制代码
ModuleNotFoundError: No module named 'win32api'
>> pip3 install pywin32
复制代码
scrapy startproject Test
复制代码
>> cd Test
>> cd Test
>> scrapy genspider test www.baidu.com
复制代码
settings.py文件中的数据库相关变量定义
# MongoDB connection settings: server address/port, target database
# and target collection (read by the Mongo pipeline in pipelines.py).
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DB = 'Tencent'
MONGODB_SET = 'jobs'
# MySQL connection settings: server, credentials and target database
# (read by MysqlPipeline in pipelines.py).
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PWD = '123456'
MYSQL_DB = 'Tencent'
复制代码
pipelines.py文件
# 这里的数据库文件相关的配置变量都定义在settings.py里面
from Test.settings import *
import pymongo
import pymysql
class MongoPipeline(object):
    """Store scraped items in a MongoDB collection.

    Connection parameters (MONGODB_HOST, MONGODB_PORT, MONGODB_DB,
    MONGODB_SET) come from settings.py via the star import above.

    FIX: renamed from lowercase ``mongoPipeline`` to PascalCase
    ``MongoPipeline`` so the ``'Test.pipelines.MongoPipeline'`` entry in
    ITEM_PIPELINES resolves; the old name is kept as an alias below for
    backward compatibility.
    """

    def __init__(self):
        # Client connection to the MongoDB server.
        self.conn = pymongo.MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        # Database handle.
        self.db = self.conn[MONGODB_DB]
        # Collection ("set") that receives the scraped items.
        self.myset = self.db[MONGODB_SET]

    def process_item(self, item, spider):
        # Convert the scrapy Item to a plain dict before inserting,
        # since pymongo expects mapping documents.
        self.myset.insert_one(dict(item))
        # Return the item unchanged so later pipelines still see it.
        return item


# Backward-compatible alias for the original lowercase class name.
mongoPipeline = MongoPipeline
class MysqlPipeline(object):
    """Persist scraped job items into the MySQL ``jobs`` table.

    Connection parameters (MYSQL_HOST, MYSQL_PORT, MYSQL_USER,
    MYSQL_PWD, MYSQL_DB) come from settings.py via the star import above.
    """

    def __init__(self):
        # Open the connection and grab a cursor for the spider's lifetime.
        self.db = pymysql.connect(
            host=MYSQL_HOST,
            port=MYSQL_PORT,
            user=MYSQL_USER,
            password=MYSQL_PWD,
            database=MYSQL_DB
        )
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Parameterized insert — values are bound by the driver, never
        # formatted into the SQL string.
        ins = 'insert into jobs(career,type,number,address,time,link) values(%s,%s,%s,%s,%s,%s)'
        columns = ('career', 'type', 'number', 'address', 'time', 'link')
        row = [item[name] for name in columns]
        self.cursor.execute(ins, row)
        # Commit per item so each record is durable immediately.
        self.db.commit()
        return item

    def close_spider(self, spider):
        # Release the cursor first, then the connection.
        self.cursor.close()
        self.db.close()
        print("Mysql 数据库断开链接")
复制代码
# Registered item pipelines: a lower number means higher priority, so
# items flow Mysql(200) -> Mongo(250) -> TestPipeline(300).
# NOTE(review): 'Test.pipelines.MongoPipeline' does not match the class
# defined in pipelines.py, which is named `mongoPipeline` (lowercase) —
# verify the class name, otherwise Scrapy fails to load this pipeline.
ITEM_PIPELINES = {
'Test.pipelines.TestPipeline': 300,
'Test.pipelines.MongoPipeline':250,
'Test.pipelines.MysqlPipeline':200
}
复制代码
# Whether to obey robots.txt (disabled for this tutorial spider).
ROBOTSTXT_OBEY = False
# Default headers attached to every request.
# FIX: the User-Agent was misspelled "Molliza/5.0"; corrected to
# "Mozilla/5.0" so servers recognize a browser-like agent.
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Log level: only WARNING and above are printed.
LOG_LEVEL = "WARNING"
复制代码