re 模块是 Python 提供的一套用于处理正则表达式的模块。
作用:搜索(re.search),在整个字符串中查找正则表达式的第一个匹配;未匹配到时返回 None。
import re

# re.search scans the whole string and returns the first match object,
# or None when the pattern does not occur anywhere in the string.
res = re.search(r"o", "hello world")
print(res.group())  # o

s = re.search(r"c", "hello world")
# "c" never occurs, so s is None. Calling s.group() unconditionally raises
# AttributeError: 'NoneType' object has no attribute 'group',
# so test the result before using it.
if s is not None:
    print(s.group())
else:
    print(s)  # None
作用:从字符串开头匹配(re.match);开头不匹配时返回 None。
import re

# re.match only tries to match at the very beginning of the string and
# returns a match object, or None when the beginning does not match.
res = re.match(r"h", "hello world")
print(res.group())  # h

s = re.match(r"c", "hello world")
# The string does not start with "c", so s is None. Calling s.group()
# unconditionally raises
# AttributeError: 'NoneType' object has no attribute 'group',
# so test the result before using it.
if s is not None:
    print(s.group())
else:
    print(s)  # None
作用:查找所有匹配项,返回 list(re.findall)。
import re

# re.findall returns every non-overlapping match as a list of strings.
lst = re.findall(r"\d+", "name Tom age 18 phone 2354786")
print(lst)  # ['18', '2354786']
import re

# When the pattern contains a capturing group, re.findall returns the text
# of that group instead of the whole match.
lst = re.findall(r"www\.(baidu|qq)\.com", "www.baidu.com")
print(lst)  # ['baidu']
import re

# (?:...) is a non-capturing group: it still groups the alternation, but
# re.findall returns the whole match because there is nothing captured.
lst = re.findall(r"www\.(?:baidu|qq)\.com", "www.baidu.com")
print(lst)  # ['www.baidu.com']
作用:查找所有匹配项,返回迭代器(re.finditer)。
import re

# re.finditer returns an iterator of match objects rather than a list of
# strings; call .group() on each element to get the matched text.
lst = re.finditer(r"\d+", "name Tom age 18 phone 2354786")
for el in lst:
    print(el.group())
# Output:
# 18
# 2354786
作用:分割字符串,返回 list 对象(re.split)。
import re

# The character class [abc] matches any one of "a", "b" or "c", so the
# string is split at every occurrence of any of those characters in a
# single left-to-right pass.
ret = re.split(r"[abc]", "qwerafjbfcd")
print(ret)  # ['qwer', 'fj', 'f', 'd']
import re

# Use a raw string for the pattern: "\d" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# Without a capturing group the separators themselves are discarded.
ret = re.split(r"\d+", "eva3egon4yuan")
print(ret)  # ['eva', 'egon', 'yuan']
import re

# Use a raw string for the pattern: "\d" in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
# Wrapping the separator in a capturing group makes re.split keep the
# matched separators in the result list.
ret = re.split(r"(\d+)", "eva3egon4yuan")
print(ret)  # ['eva', '3', 'egon', '4', 'yuan']
作用:替换(re.sub),返回替换后的字符串。
import re

# re.sub replaces every match of the pattern (here: each whitespace
# character) with the replacement string and returns the new string.
ret = re.sub(r"\s", "__", "hello world")
print(ret)  # hello__world
作用:替换(re.subn),返回元组(替换的结果, 替换次数)。
import re

# re.subn performs the same replacement as re.sub but returns a tuple of
# (new_string, number_of_replacements).
ret = re.subn(r"\s", "__", "name age gender phone")
print(ret)  # ('name__age__gender__phone', 3)
作用:将正则表达式编译成一个正则表达式对象,进行预加载(re.compile)。
import re

# re.compile builds a reusable pattern object; its methods (search, match,
# findall, ...) mirror the module-level functions without the pattern
# argument.
obj = re.compile(r"\d{3}")
ret = obj.search("abc333eee")
print(ret.group())  # 333
正则表达式中,"." 表示匹配除 "\n" 之外的所有字符。如果字符串中含有换行,默认情况下 "." 无法跨行匹配,正则只能在各行内分别匹配;而使用 re.S(即 re.DOTALL)后,"." 可以匹配 "\n",即把整个字符串当作一个整体来匹配。
from urllib.request import urlopen
import re

# General scraping template. The literals "url", "正则表达" and "组名" appear
# to be placeholders for the real address, the real pattern and the name of
# a (?P<name>...) group in that pattern; replace them before running.

# Target address.
url = "url"

# Fetch the whole page; the with-block closes the HTTP response afterwards.
with urlopen(url) as response:
    content = response.read().decode()

# Pre-compile the regular expression.
obj = re.compile(r"正则表达")

# Extract the wanted part by group name.
# NOTE: search() returns None when nothing matches, in which case .group()
# raises AttributeError.
res = obj.search(content).group("组名")
from urllib.request import urlopen
import re

# Pre-compiled pattern for one movie entry. re.S lets "." match newlines so
# a single match can span the multi-line HTML of an item.
obj = re.compile(r'<div class="item">.*?<span class="title">(?P<name>.*?)</span>.*?导演: (?P<director>.*?) .*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?<span>(?P<people>.*?)人评价</span>', re.S)


def get_content(url):
    """
    Fetch a page.
    :param url: address of the page
    :return: the whole page decoded as UTF-8 text
    """
    # The with-block closes the HTTP response once the body has been read.
    with urlopen(url) as response:
        return response.read().decode("utf-8")


def parse_content(content):
    """
    Extract the movie entries from a page.
    :param content: the whole page as text
    :return: generator yielding one dict per match with the keys
             "name", "director", "score" and "people"
    """
    for el in obj.finditer(content):
        yield {
            "name": el.group("name"),
            "director": el.group("director"),
            "score": el.group("score"),
            "people": el.group("people")
        }


def main():
    """
    Fetch and parse ten pages and append every parsed entry to movie.txt.
    :return: None
    """
    for i in range(10):
        # The start offset advances by 25 per iteration.
        url = "https://movie.douban.com/top250?start=%s&filter=" % (i*25)
        p = parse_content(get_content(url))
        # Append mode keeps the entries written for earlier pages.
        with open("movie.txt", mode="a", encoding="utf-8") as f:
            for el in p:
                f.write(str(el) + "\n")


if __name__ == "__main__":
    main()