"""Maintenance helpers for the spider's MongoDB seed collection.

Provides a thin ``Mongo`` wrapper (status resets, index management,
duplicate detection/removal) and a ``click`` command that drives it.
"""
import click
import pymongo

# Basic database connection settings.
db_configs = {
    'type': 'mongo',
    'host': '127.0.0.1',
    'port': '27017',
    "user": "",
    "password": "",
    'db_name': 'spider'
}


class Mongo:
    """Small wrapper around a PyMongo client bound to ``db_configs``."""

    def __init__(self):
        """Create the (lazily connecting) client and select the database.

        Credentials are applied only when both ``user`` and ``password``
        are non-empty in ``db_configs``.
        """
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # The config key is "password"; the previous lookup of "passwd"
        # always returned None, so authentication was silently skipped.
        self.password = db_configs.get("password")

        client_kwargs = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() was removed in PyMongo 4; credentials
            # are passed to the client instead. authSource keeps the
            # previous intent of authenticating against the target database.
            client_kwargs.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )

        self.client = pymongo.MongoClient(
            f'mongodb://{self.host}:{self.port}', **client_kwargs
        )
        self.db = self.client[self.db_name]

    def reset_status(self, col="dianping_seed_data"):
        """Set ``status`` back to 0 for documents whose status is 1 or 3.

        :param col: name of the collection to update.
        """
        self.db[col].update_many(
            {'status': {'$in': [1, 3]}},
            {'$set': {"status": 0}},
        )

    def reset_all_status(self, col="dianping_seed_data"):
        """Set ``status`` to 0 on every document of the collection.

        :param col: name of the collection to update.
        """
        self.db[col].update_many({}, {'$set': {"status": 0}})

    def add_index(self, col="dianping_seed_data", unique=False):
        """Create an ascending index on ``status``.

        Status codes: 0 = initial, 1 = download started, 2 = download
        finished.

        :param col: name of the collection to index.
        :param unique: build a unique index. Defaults to False because many
            documents share the same status (``reset_all_status`` sets all
            of them to 0), so a unique index on this field cannot be built
            on a populated collection.
        """
        self.db[col].create_index([('status', pymongo.ASCENDING)], unique=unique)

    def get_index(self, col="dianping_seed_data"):
        """Print every index definition of the collection.

        :param col: name of the collection to inspect.
        """
        for index in self.db[col].list_indexes():
            print(index)

    def find_duplicate(self, col="dianping_seed_data"):
        """Group documents by ``url`` and write duplicate groups to ``result``.

        Each output document holds the grouped url as ``_id``, the set of
        original ``_id`` values (``_id_list``), the set of ``status`` values
        and the group size (``count``). The ``$out`` stage replaces the
        ``result`` collection with the aggregation output.

        :param col: name of the collection to scan.
        :return: the cursor returned by ``aggregate`` (already iterated).
        """
        result = self.db[col].aggregate([
            {'$group': {
                '_id': {'url': "$url"},
                # Collect the _id of every document sharing this url.
                '_id_list': {'$addToSet': "$_id"},
                # Collect the distinct status values of the group.
                'status': {'$addToSet': "$status"},
                'count': {'$sum': 1}
            }},
            # Keep only urls that actually occur more than once; without
            # this every url group was written to the result collection.
            {'$match': {'count': {'$gt': 1}}},
            {'$out': 'result'}
        ], allowDiskUse=True)
        # NOTE(review): a pipeline ending in $out normally yields no
        # documents to the client, so this loop is expected to print
        # nothing; kept for parity with the previous behaviour.
        for item in result:
            print(item)
        return result

    def delete_dup(self, col="dianping_seed_data"):
        """Delete duplicates listed in ``result``, keeping one per group.

        For every document in the ``result`` collection all ids but the
        first entry of ``_id_list`` are removed from ``col``; afterwards the
        ``result`` collection is dropped. Errors are reported on stdout and
        not re-raised (best-effort maintenance task).

        :param col: name of the collection to delete from.
        """
        delete_data = self.db.result.find()
        try:
            for group in delete_data:
                # Keep the first id of the group, remove the rest.
                extra_ids = (group.get("_id_list") or [])[1:]
                if extra_ids:
                    # One round trip per group instead of one per id.
                    self.db[col].delete_many({'_id': {'$in': extra_ids}})
            self.db.result.drop()
        except Exception as e:
            print("删除的时候出现问题", e.args)


@click.command()
@click.option('--s', type=str, default="two", help="状态:all表示所有重置为0,two:表示重置状态为一、3的重置为0")
@click.option('--i', type=str, default="a", help="a:增长索引 g:获取索引")
@click.option('--d', type=str, default="f", help="d:删除 f:查询并生成聚合以后的结果")
def run(s, i, d):
    """Run the maintenance steps selected by the command-line options.

    :param s: ``all`` resets every status to 0, ``two`` resets statuses
        1 and 3 to 0.
    :param i: ``a`` adds the status index, ``g`` prints the indexes.
    :param d: ``d`` deletes duplicates, ``f`` aggregates duplicates into
        the ``result`` collection.
    """
    m = Mongo()

    if s:
        print("获取参数为:", s)
        if s == "all":
            print("全部数据状态重置为0:", s)
            m.reset_all_status()
        elif s == "two":
            print("部分数据状态重置为0:", s)
            # Previously this branch only printed and never reset anything.
            m.reset_status()

    if i == "a":
        m.add_index()
    elif i == "g":
        m.get_index()

    if d == "d":
        m.delete_dup()
    elif d == "f":
        # Previously tested `i == "f"`, which made this branch unreachable
        # for every documented option value.
        m.find_duplicate()


if __name__ == '__main__':
    # NOTE(review): the click command `run` is not invoked here; executing
    # the file only removes duplicates already listed in `result`. Call
    # `run()` instead if the CLI options are meant to be used.
    m = Mongo()
    m.delete_dup()