Beautifulsoup-基础知识

时间 2019-12-19
标签 beautifulsoup 基础知识 繁體版
原文原文链接
soup = BeautifulSoup(html_doc, features='lxml')
tag1 = soup.find(name='a')  #找到第一个a标签，返回一个soup对象
tag2 = soup.find_all(name='a')  #找到全部a标签，返回一个列表，列表中全部元素为soup对象
tag3 = soup.select_one('#link2')  #找到id=link2的标签（select返回列表，select_one返回单个标签）
name = tag3.name  #获取标签名
attrs = tag3.attrs  #获取属性，返回字典类型
tag3.attrs = {'href': 'www.baidu.com'}  #修改或添加标签属性（注意：整体赋值会替换原有的全部属性）
del tag3.attrs['href']  #删除标签属性

#判断是标签对象还是文本：
from bs4.element import Tag
tags = soup.find('body').children
for tag in tags:
    if type(tag) == Tag:
        print(tag)
    else:
        print('文本。。。')

children  #body中全部儿子标签
descendants  #body中全部子子孙孙标签
body = soup.find('body')
v = body.descendants

clear  #将标签的所有子标签全部清空（保留body标签名）
soup.find('body').clear()
print(soup)

decompose  #将标签的所有子标签全部清空（包括body标签名）
soup.find('body').decompose()
print(soup)

extract  #将标签的所有子标签全部清空（包括body标签名），返回删除的标签（类似pop）

find_all
v = soup.find_all(name=['a','div'])  #找到全部a标签和div标签
v = soup.find_all(id=['link1','link2'])  #找到全部id=link1或id=link2的标签
import re
rep = re.compile('^p')
v = soup.find_all(name=rep)  #找以p开头的全部标签
rep = re.compile('^sister.*')  # .*匹配除换行符之外的任意字符，并且可有可无
v = soup.find_all(class_=rep)  #找class以sister开头的标签
rep = re.compile('http://www.baidu.com/static/.*')
v = soup.find_all(href=rep)  #通常用于匹配页码

get  #获取标签属性
tag = soup.find('a')
v = tag.get('id')  #获取a标签中id属性的值

has_attr  #判断是否含有某属性
tag = soup.find('a')
v = tag.has_attr('id')  #判断a标签是否含有id属性

get_text  #获取标签内部文本内容
tag = soup.find('a')
v = tag.get_text()  #获取a标签内部文本内容

index  #标签在某标签中的索引位置
tag = soup.find('body')
v = tag.index(tag.find('div'))  #找div标签在body中的索引位置

is_empty_element  #检查是不是空标签或自闭合标签
判断是否是如下标签之一：br hr input img meta spacer link frame base

当前标签的关联标签
soup.next ---> soup.find_next(...)
soup.next_element ---> soup.find_next(...)
soup.next_elements
soup.next_sibling
soup.next_siblings
tag.previous
tag.previous_element
tag.previous_elements
tag.previous_sibling
tag.previous_siblings
tag.parent
tag.parents

select, select_one  #CSS选择器
append  #追加标签到最后
insert  #插入标签到指定位置
wrap  #包裹
unwrap  #解包裹