Automatically Fetching a CSDN Blog's Article List
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Python 2: fetch a CSDN blog page and extract its article list.
import re
import os
import HTMLParser
import urllib2

addr_file = 'blog_addr.dat'
html_file = 'out.html'
diff_flag = 0

def maenu(file_name):
    """Show the saved blog URLs as a menu and return the chosen one."""
    count = 1
    addr = {}
    try:
        fp = open(file_name, 'r')
    except IOError:
        # First run: create an empty address file, then open it.
        open(file_name, 'w').close()
        fp = open(file_name, 'r')
    print "Choose a link to download:"
    for eachline in fp:
        if eachline[0] == '#':        # lines starting with '#' are comments
            continue
        print "%d: %s" % (count, eachline),
        addr[count] = eachline
        count += 1
    print "%d: add a new blog URL" % count
    choice = raw_input('>> ')
    fp.close()
    if choice.lower() in ('q', 'quit'):   # let the user bail out
        quit()
    try:
        if int(choice) == count:
            add_blog(file_name)
        else:
            return addr[int(choice)].strip()
    except (ValueError, KeyError):
        print "Invalid input, please choose again!"

def add_blog(file_name):
    """Append a new blog URL to the address file."""
    fp = open(file_name, 'a')
    addr = raw_input('Enter the blog URL: ')
    fp.write(addr + '\n')
    print "%s has been added to the blog URL list!" % addr
    fp.close()

def is_set(var_name):
    """Return 1 if the named global variable exists, 0 otherwise."""
    try:
        eval(var_name)
    except NameError:
        return 0
    else:
        return 1

class parseLinks(HTMLParser.HTMLParser):
    """Collect <a> tags that carry a title attribute: those are the articles."""
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            attrs = dict(attrs)
            article = attrs.get('title')
            address = attrs.get('href')
            if article is None or address is None:
                return
            # Skip CSDN sidebar links: view counters, "follow", "private message".
            if re.search('次数|关注|私信', article):
                return
            # write_file and blog_address are module-level globals set in main.
            write_file.write("Title: %s\nURL: %s%s\n" % (article, blog_address, address))

def unload(common_url, file_name):
    """Download the blog page, pretending to be a regular browser."""
    outfile = open(file_name, 'w')
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                             'Gecko/20100101 Firefox/23.0'}
    req = urllib2.Request(url=common_url, headers=headers)
    outfile.write(urllib2.urlopen(req).read())
    outfile.close()

def check_file(file_name):
    return 1 if os.path.isfile(file_name) else 0

def prt_content(diff_flag, author):
    """Print the article list; if an old copy exists, show only what changed."""
    file1 = '%s.article' % author
    if diff_flag == 1:
        file2 = '%s.article.new' % author
        print "New content since the last run:"
        os.system("diff %s %s" % (file1, file2))
        os.system("mv %s %s" % (file2, file1))   # the new list replaces the old
    else:
        print open(file1, 'r').read()

def get_blog_info(html_file):
    """Pull the username and blog root URL out of the page's inline JS."""
    line = open(html_file, 'r').read()
    username = re.findall(r"(var username.*?;)", line, re.S)[0]
    blog_address = re.findall(r"(var blog_address.*?net)", line, re.S)[0]
    username = re.sub(r"\"|;|var|=| |username", '', username, 10)
    blog_address = re.sub(r"\"|;|var|=| |blog_address", '', blog_address, 10)
    return (line, username, blog_address)

if __name__ == '__main__':
    # maenu() returns None on bad input, so keep asking until we get a URL.
    while (is_set('chaper_url') == 0) or (chaper_url is None):
        chaper_url = maenu(addr_file)
    print "Downloading the article list...."
    unload(chaper_url, html_file)
    (line, username, blog_address) = get_blog_info(html_file)
    article_file = '%s.article' % username
    if check_file(article_file):
        # An old copy exists: write to a temp file so we can diff against it.
        article_file = '%s.article.new' % username
        diff_flag = 1
    write_file = open(article_file, 'w')
    write_file.write("Author: %s\n" % username)
    lParser = parseLinks()
    lParser.feed(line)
    lParser.close()       # flush the parser before closing the output file
    write_file.close()
    prt_content(diff_flag, username)
    os.remove(html_file)

Posting it here for everyone's critique and suggestions!
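The script targets Python 2 (urllib2, HTMLParser, print statements). If you are on Python 3, the same fetch-and-parse idea might look like the sketch below: urllib2 became urllib.request, and HTMLParser moved to html.parser. The blog URL in it is only a placeholder, not a real address.

# Python 3 sketch of the same idea; the URL below is a placeholder.
import urllib.request
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # Keep <a> tags that carry both a title and an href.
        if tag == 'a':
            attrs = dict(attrs)
            if 'title' in attrs and 'href' in attrs:
                self.links.append((attrs['title'], attrs['href']))

def fetch(url):
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urllib.request.urlopen(req).read().decode('utf-8', 'ignore')

parser = LinkParser()
parser.feed(fetch('http://blog.csdn.net/someuser'))  # placeholder blog URL
for title, href in parser.links:
    print(title, href)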
Note: if the blog being downloaded already has a local copy, the script reports whether new articles have been published, and if so, prints the new articles' titles and links. The complete article list is stored in the <author name>.article file.
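The update check shells out to the external diff and mv commands, so it only works where those tools are installed. A portable alternative, if you wanted one, is Python's difflib; here is a minimal sketch, assuming the two .article files described above (the file names in the usage comment are illustrative):

import difflib

def show_new_articles(old_file, new_file):
    # Compare the previously saved list with the freshly downloaded one
    # and print only the lines added since last time.
    with open(old_file) as f:
        old_lines = f.readlines()
    with open(new_file) as f:
        new_lines = f.readlines()
    for line in difflib.unified_diff(old_lines, new_lines, lineterm=''):
        if line.startswith('+') and not line.startswith('+++'):
            print(line[1:].rstrip())

# Hypothetical usage with the files the script produces:
# show_new_articles('someuser.article', 'someuser.article.new')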