BeautifulSoup学习笔记
学习自:http://rsj217.diandian.com/post/2012-11-01/40041235132
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Walk-through of basic BeautifulSoup (bs4) usage: parsing a small HTML
# snippet, inspecting node types, navigating the tree, replacing a node,
# searching with findAll, and reading/modifying tag attributes.
#
# Every print call takes exactly one argument wrapped in parentheses, and
# blank lines are emitted with print(''), so the output is the same under
# the Python 2 print statement and the Python 3 print function.
import urllib2
import sys
import re

from bs4 import BeautifulSoup  # HTML
from bs4 import BeautifulStoneSoup  # XML
import bs4  # ALL

doc = [
    '<html><head><title>Page title</title></head>',
    '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
    '<p id="secondpara" align="blah">This is paragraph <b>two</b>.',
    '</html>',
]

# BeautifulSoup takes the markup as a single string argument.
# NOTE(review): no parser is named here, so bs4 chooses one itself; the
# resulting tree may depend on which parsers are installed -- confirm.
soup = BeautifulSoup(''.join(doc))
print(type(soup))
print(type(soup.html))
print(type(soup.title.string))

# A BeautifulSoup document tree is built from three basic kinds of objects;
# the sections below print the type and value of several nodes.
print('')
# The <html> element, reached as an attribute of the soup.
html = soup.html
print(type(html))
print(html)
print('')
# The <title> tag.
title = soup.title
print(type(title))
print(title)
print('')
# The direct children of the soup, via .contents.
contents = soup.contents
print(type(contents))
print(contents)
print('')

# Use .contents to inspect the tree level by level.
print(len(soup.contents[0].contents))
print(soup.contents[0].contents[0])
print(soup.contents[0].contents[1])
print(len(soup.contents[0].contents[0]))
print(soup.contents[0].contents[0].contents[0])
print(soup.contents[0].contents[0].contents[0].contents[0])

# Step through successive elements with .next, similar to a depth-first
# traversal of the tree.
print('')
head = html.next
print(type(head))
print(head)
print('')
title = head.next  # rebinds `title` to the element following `head`
print(type(title))
print(title)
print('')
title_content = title.next
print(type(title_content))
print(title_content)
print('')
body = title_content.next
print(type(body))
print(body)

# Replace a node with replaceWith.
print('')
print(head)
print(head.parent)
head.replaceWith('head was replace')
print(head.parent)  # prints None: the old node is kept but cut out of the tree
print(head)  # the detached node itself is unchanged and prints normally
print(soup.head)  # prints None: the tree no longer contains a head tag
print(soup)  # the document itself has been modified
print('')

# Search with find / findAll.
print(soup.findAll('p'))
print('')
print(soup.findAll('p', id='firstpara'))
print('')
# Pass one or more attribute/value pairs as a dict.
print(soup.findAll('p', {'align': 'blah'}))
# Match attribute values with a regular expression.
print(soup.findAll(id=re.compile("para$")))

# Read and modify tag attributes.
print('')
p1 = soup.p
print(p1)
print(p1['id'])
p1['id'] = 'changeid'
print(p1)  # the tag now shows the new id
print(soup)  # the document itself has been modified