Automatically Fetching a CSDN Blog's Article List
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Python 2: fetch a CSDN blog page and extract its article list.
import re
import os
import HTMLParser
import urllib2

addr_file = 'blog_addr.dat'
html_file = 'out.html'
diff_flag = 0

def maenu(file_name):
    """Show the saved blog URLs as a menu and return the chosen one."""
    count = 1
    addr = {}
    try:
        fp = open(file_name, 'r')
    except IOError:
        # First run: create an empty address file, then open it.
        open(file_name, 'w').close()
        fp = open(file_name, 'r')
    print "Choose a link to download:"
    for eachline in fp:
        if eachline[0] == '#':        # lines starting with '#' are comments
            continue
        print "%d: %s" % (count, eachline),
        addr[count] = eachline
        count += 1
    print "%d: add a new blog URL" % count
    choice = raw_input('>> ')
    fp.close()
    if choice.lower() in ('q', 'quit'):   # let the user bail out
        quit()
    try:
        if int(choice) == count:
            add_blog(file_name)
        else:
            return addr[int(choice)].strip()
    except (ValueError, KeyError):
        print "Invalid input, please choose again!"

def add_blog(file_name):
    """Append a new blog URL to the address file."""
    fp = open(file_name, 'a')
    addr = raw_input('Enter the blog URL: ')
    fp.write(addr + '\n')
    print "%s has been added to the blog URL list!" % addr
    fp.close()

def is_set(var_name):
    """Return 1 if the named global variable exists, 0 otherwise."""
    try:
        eval(var_name)
    except NameError:
        return 0
    else:
        return 1

class parseLinks(HTMLParser.HTMLParser):
    """Collect <a> tags that carry a title attribute: those are the articles."""
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            attrs = dict(attrs)
            article = attrs.get('title')
            address = attrs.get('href')
            if article is None or address is None:
                return
            # Skip CSDN sidebar links: view counters, "follow", "private message".
            if re.search('次数|关注|私信', article):
                return
            # write_file and blog_address are module-level globals set in main.
            write_file.write("Title: %s\nURL: %s%s\n" % (article, blog_address, address))

def unload(common_url, file_name):
    """Download the blog page, pretending to be a regular browser."""
    outfile = open(file_name, 'w')
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                             'Gecko/20100101 Firefox/23.0'}
    req = urllib2.Request(url=common_url, headers=headers)
    outfile.write(urllib2.urlopen(req).read())
    outfile.close()

def check_file(file_name):
    return 1 if os.path.isfile(file_name) else 0

def prt_content(diff_flag, author):
    """Print the article list; if an old copy exists, show only what changed."""
    file1 = '%s.article' % author
    if diff_flag == 1:
        file2 = '%s.article.new' % author
        print "New content since the last run:"
        os.system("diff %s %s" % (file1, file2))
        os.system("mv %s %s" % (file2, file1))   # the new list replaces the old
    else:
        print open(file1, 'r').read()

def get_blog_info(html_file):
    """Pull the username and blog root URL out of the page's inline JS."""
    line = open(html_file, 'r').read()
    username = re.findall(r"(var username.*?;)", line, re.S)[0]
    blog_address = re.findall(r"(var blog_address.*?net)", line, re.S)[0]
    username = re.sub(r"\"|;|var|=| |username", '', username, 10)
    blog_address = re.sub(r"\"|;|var|=| |blog_address", '', blog_address, 10)
    return (line, username, blog_address)

if __name__ == '__main__':
    # maenu() returns None on bad input, so keep asking until we get a URL.
    while (is_set('chaper_url') == 0) or (chaper_url is None):
        chaper_url = maenu(addr_file)
    print "Downloading the article list...."
    unload(chaper_url, html_file)
    (line, username, blog_address) = get_blog_info(html_file)
    article_file = '%s.article' % username
    if check_file(article_file):
        # An old copy exists: write to a temp file so we can diff against it.
        article_file = '%s.article.new' % username
        diff_flag = 1
    write_file = open(article_file, 'w')
    write_file.write("Author: %s\n" % username)
    lParser = parseLinks()
    lParser.feed(line)
    lParser.close()       # flush the parser before closing the output file
    write_file.close()
    prt_content(diff_flag, username)
    os.remove(html_file)

Posting it here for everyone's critique and suggestions!
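The script targets Python 2 (urllib2, HTMLParser, print statements). If you are on Python 3, the same fetch-and-parse idea might look like the sketch below: urllib2 became urllib.request, and HTMLParser moved to html.parser. The blog URL in it is only a placeholder, not a real address.

# Python 3 sketch of the same idea; the URL below is a placeholder.
import urllib.request
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Keep <a> tags that carry both a title and an href.
        if tag == 'a':
            attrs = dict(attrs)
            if 'title' in attrs and 'href' in attrs:
                self.links.append((attrs['title'], attrs['href']))

def fetch(url):
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urllib.request.urlopen(req).read().decode('utf-8', 'ignore')

parser = LinkParser()
parser.feed(fetch('http://blog.csdn.net/someuser'))  # placeholder blog URL
for title, href in parser.links:
    print(title, href)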
Note: if the blog being downloaded already has a local copy, the script reports whether new articles have been published, and if so, prints the new articles' titles and links. The complete article list is stored in the <author name>.article file.
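The update check shells out to the external diff and mv commands, so it only works where those tools are installed. A portable alternative, if you wanted one, is Python's difflib; here is a minimal sketch, assuming the two .article files described above (the file names in the usage comment are illustrative):

import difflib

def show_new_articles(old_file, new_file):
    # Compare the previously saved list with the freshly downloaded one
    # and print only the lines added since last time.
    with open(old_file) as f:
        old_lines = f.readlines()
    with open(new_file) as f:
        new_lines = f.readlines()
    for line in difflib.unified_diff(old_lines, new_lines, lineterm=''):
        if line.startswith('+') and not line.startswith('+++'):
            print(line[1:].rstrip())

# Hypothetical usage with the files the script produces:
# show_new_articles('someuser.article', 'someuser.article.new')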