
Baidu Article Spider (Complete Version)

2012-09-10 


1. Code

#coding:gb2312

import urllib2,urllib,re,os
import sqlite3,cookielib,time
'''
    Baidu spider class
    @author:FC_LAMP
'''
class SpiderBaiDu:
    # class attributes
    sqlit = None        # sqlite3 connection
    cur   = None        # sqlite3 cursor
    baseurl = 'http://hi.baidu.com'
    total = 0

    # strip single quotes from a string
    def qutoSin(self,string):

        return string.replace("'","")

    # log in to Baidu Space
    '''
      user is the username
      pwd  is the password
    '''
    def LoginBaiDu(self,user,pwd):

        # install a cookie-aware opener so the session cookie is kept
        cookie = cookielib.CookieJar()
        cookieProc = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(cookieProc)
        urllib2.install_opener(opener)

        # build the login request
        header = {'User-Agent':'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'}
        post = {
            'username':user,
            'password':pwd,
            'tpl':'mn',
            'u':'http://www.baidu.com/',
            'psp_tt':0,
            'mem_pass':'on'
            }
        post = urllib.urlencode(post)

        req = urllib2.Request(
            url='https://passport.baidu.com/?login',
            data=post,
            headers = header
            )
        res = urllib2.urlopen(req).read(500)

        # the response mentions 'passCookie' when the login succeeded
        if 'passCookie' in res:
            flag = True
        else:
            flag = 'Login Fail:%s'%user

        return flag


    # create the database
    '''
        dbFile is the database file
    '''
    def created_db(self,dbFile):
        # normalize path separators
        dbFile = dbFile.replace("\\","/")
        self.sqlit = sqlite3.connect(dbFile)
        self.cur = self.sqlit.cursor()
        sql = """
            CREATE TABLE IF NOT EXISTS `article`
            (
                 a_id char(24) PRIMARY KEY,
                 title varchar(255),
                 created_time datetime,
                 category varchar(255),
                 orgurl varchar(255),
                 content text
            )
            """
        self.cur.execute(sql)
        self.sqlit.commit()
        # Note: more on Python database operations: http://hi.baidu.com/fc_lamp/blog/item/3d48778a1f93d06a9e2fb4c3.html

    # parse pages
    '''
        url is the starting URL of the pages to parse
        debug: when True, debugging information is printed
    '''
    def parse_html(self,url,debug=False):

        while True:
            c = urllib2.urlopen(url).read()

            # title and post time
            # compile the regex (note the trailing flags)
            p = re.compile(
                    r'<div.*?id="?m_blog"?.*?<div.*?
            # (the rest of parse_html appears only as an image in the original post)
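Since parse_html is cut off above, the driver code below is only a sketch of how the class is meant to be wired together; the credentials, the database file name spider.db, and the blog URL are all placeholders, not values from the original post.

# Usage sketch (assumptions: credentials, 'spider.db' and the blog URL are
# placeholders; parse_html is truncated above, so this shows the intended
# call order rather than a verified run).
if __name__ == '__main__':
    spider = SpiderBaiDu()
    flag = spider.LoginBaiDu('your_username', 'your_password')
    if flag is not True:
        print flag                       # LoginBaiDu returns an error string on failure
    else:
        spider.created_db('spider.db')   # creates the `article` table if it does not exist
        spider.parse_html('http://hi.baidu.com/fc_lamp', debug=True)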

2. Data file:

(The original post shows a screenshot of the generated SQLite data file here.)
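To check what the spider actually stored, here is a minimal inspection sketch, assuming the database was created as spider.db (a placeholder name) with the `article` table defined above.

# Inspect the generated SQLite data file (assumption: it was created as 'spider.db').
import sqlite3

conn = sqlite3.connect('spider.db')
cur = conn.cursor()
cur.execute('SELECT a_id, title, created_time, category, orgurl FROM article ORDER BY created_time DESC')
for a_id, title, created_time, category, orgurl in cur.fetchall():
    print '%s | %s | %s' % (created_time, category, title)
conn.close()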

# References
# On using regular expressions in Python, see:
#   1. http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html
#   2. http://hi.baidu.com/fc_lamp/blog/item/1e8bab1f258c58e31bd5769e.html

# On database operations, see http://hi.baidu.com/fc_lamp/blog/item/3d48778a1f93d06a9e2fb4c3.html

# On cookie-based login, see http://hi.baidu.com/fc_lamp/blog/item/2d947745fc31cf9fb2b7dc0a.html

# On Python exception handling, see http://hi.baidu.com/fc_lamp/blog/item/8a07f31e3e5c56dca7866992.html

# On keeping a fixed number of decimal places, see http://hi.baidu.com/fc_lamp/blog/item/09555100745c3eda267fb554.html

# Logging in to a 163 mailbox with Python and fetching the address book: http://hi.baidu.com/fc_lamp/blog/item/2466d1096fcc532de8248839.html

PS: use crawlers with caution~~~
