当前位置: 首页 >> 程序设计 >> 用python完成网页抓取的工作
 

用python完成网页抓取的工作

作者:Benyur      来源:csdn     发表时间:2006-07-11     浏览次数:      字号:    

# -*- encoding:UTF-8 -*-
'''
    This is geturl.
    Written by yuzebin : yuzebin@gmail.com
    Important: this script is meant to run under Cygwin or Linux; to run it on
        Windows you need the Windows builds of curl and wget.
'''
# Overview of the classes defined in this file, kept as a module-level string.
# The name used to be `class`, which is a reserved word and therefore a
# SyntaxError; nothing in the file reads this variable, so it was renamed.
classes='''
    CGetPage is charge of to get a url , it have three methods to get a page : urllib,curl and wget;
    CParsePage is charge of to parse the page , and return the match;
    CGetMatch is the forcad class to wrap the CGetPage and CParsePage.
'''
# Human-readable changelog of the script, newest entry first.
# Kept as a module-level string; no code in this file reads it.
history='''
    2006.07.10 version 0.0.0.9 :
       Publish this code to internet , ;-)

    2006.06.27 version 0.0.0.7 :
        refrectoring class CParsePage : return re.match only
        refrectoring class CGetCount : rename to CGetMatch

    2006.06.26 version 0.0.0.3 :
        modify class CParsePage , return re.match

    2006.06.22 version 0.0.0.2 :
        add class CGetCount
        this version is the first workable version.
        add cnsky.

    2006.06.21 initial version 0.0.0.1 :
        add class CGetPage and CParsePage
        cannot work ;-)
'''
import string,re,os,fnmatch,sys,copy,gzip,time,datetime,urllib
from types import *

# Global switch read by funcDebugInfo: debug messages are printed only
# when this is true.
isDebugMode = False


def funcUrlRead(url):
    '''Download url with urllib and return the complete response body.

    The connection is closed explicitly once the body has been read (or
    reading failed) instead of being left for the garbage collector.

    url -- address handed to urllib.urlopen.
    Any error raised by urllib.urlopen or by read() propagates to the caller.
    '''
    handle = urllib.urlopen(url)
    try:
        return handle.read()
    finally:
        handle.close()

def funcOutputMessage(msg):
    '''Print msg, converted with str(), on its own line of standard output.'''
    # Call form with a single argument: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(str(msg))

def funcDebugInfo(msg):
    '''Print msg (converted with str()) only while debug mode is enabled.

    Debug mode is the module-level flag isDebugMode; when it is false the
    call does nothing.
    '''
    if isDebugMode:
        # Single-argument call form: same output on Python 2 and Python 3.
        print(str(msg))

class CGetPage:
    '''Fetch the content behind a URL.

    Three retrieval strategies are offered:
      getPage  -- in-process download through the module-level funcUrlRead
      curlPage -- runs the external "curl" program and captures its output
      wgetPage -- runs the external "wget" program, downloading to a directory

    Attributes:
      url  -- address to fetch; None if urlCheck rejected the given url
      page -- body stored by getPage/curlPage; None until one of them has run
      path -- directory used by wgetPage; None until setPath is called
    '''

    def __init__(self,url):
        '''Remember url if it passes urlCheck.'''
        # Every attribute gets a defined value so that reading it before the
        # corresponding method ran yields None instead of AttributeError.
        self.url = None
        self.page = None
        self.path = None
        if self.urlCheck(url):
            self.url = url

    def urlCheck(self,url):
        '''Return True if url is acceptable; currently every url is accepted.'''
        #todo , check the url is valid url.
        return True

    def getPage(self):
        '''Download self.url in-process and store the body in self.page.'''
        self.page = funcUrlRead(self.url)

    def _curlCommand(self):
        '''Build the curl argument list: empty user agent (-A ""), silent (-s).'''
        return ['curl', '-A', '', '-s', self.url]

    def curlPage(self):
        '''Fetch self.url with curl and store its standard output in self.page.

        Requires curl to be installed; OSError is raised if it cannot be
        started.
        '''
        import subprocess
        # The previous string-built command never contained the URL: its
        # quoting was broken, so the text "+ self.url +" was sent to the shell.
        # An argument list avoids both that and shell interpretation of
        # characters such as "&" or ";" inside the URL.
        proc = subprocess.Popen(self._curlCommand(), stdout=subprocess.PIPE)
        self.page = proc.communicate()[0]

    def setPath(self,path):
        '''Set the directory wgetPage downloads into.'''
        self.path = path

    def wgetPage(self):
        '''Download self.url with "wget -c" into self.path.

        Requires wget to be installed; OSError is raised if it cannot be
        started. The process working directory is changed to self.path, as
        before. If no path was set, the download goes to the current
        directory.
        '''
        import subprocess
        if self.path is not None:
            os.chdir(self.path)
        # Argument list instead of a shell string: the URL is passed verbatim.
        subprocess.call(['wget', '-c', self.url])

class CParsePage:
    '''Search a downloaded page for a regular expression.

    Attributes:
      rule  -- compiled pattern (callers may also assign a pattern string);
               None if the rule given to the constructor did not compile
      page  -- page content to search; None if the rule did not compile
      match -- result of the last parsePage call (match object or None)
    '''

    def __init__(self,rule,page):
        '''Compile rule and remember page; page is kept only if rule compiled.'''
        # Defined up front so the attributes exist even when compilation fails.
        self.rule = None
        self.page = None
        self.match = None
        # "is not False" keeps accepting an overridden ruleCompile that
        # returns None on success, as the original implementation did.
        if self.ruleCompile(rule) is not False:
            self.page = page

    def ruleCompile(self,rule):
        '''Compile rule into self.rule.

        Returns True on success and False when rule is not a valid pattern;
        on failure self.rule keeps its previous value.
        '''
        try:
            self.rule = re.compile(rule)
        except (re.error, TypeError):
            # re.error: malformed pattern; TypeError: rule is not a string.
            return False
        return True

    def parsePage(self):
        '''Search self.page for self.rule and store the result in self.match.

        A byte-string page is decoded with the charset reported by
        getCharset, falling back to utf-8 when none is reported. A page that
        is already text is searched as it is.
        '''
        text = self.page
        if not isinstance(text, type(u'')):
            text = text.decode(self.getCharset(text) or 'utf-8')
        self.match = re.search(self.rule, text)
        funcDebugInfo(type(self.match))

    def getCharset(self,string):
        '''Return the 'encoding' entry of chardet.detect(string).

        The result may be None -- NOTE(review): chardet is documented to
        report None when it cannot decide; parsePage handles that case.
        '''
        # Third-party package, imported on first use so that the rest of the
        # module stays usable without it.
        import chardet
        #todo : automatic discern the charset
        return chardet.detect(string)['encoding']

class CGetMatch:
    '''Facade tying a CGetPage (download) to a CParsePage (regex search).

    The constructor already downloads url once; getMatch downloads the url
    it is given, searches it and leaves the result in self.match.
    '''

    def __init__(self,url,rule):
        '''Create the fetcher and parser and download url a first time.'''
        self.url, self.rule = url, rule
        fetcher = CGetPage(url)
        self.cgetpage = fetcher
        fetcher.getPage()
        self.page = fetcher.page
        self.cparsepage = CParsePage(rule, fetcher.page)

    def getMatch(self,url,rule):
        '''Download url, search it for rule and store the result in self.match.

        url and rule replace the values held by this object and by the
        wrapped fetcher and parser; rule is handed to the parser as given,
        without compiling it here.
        '''
        fetcher, parser = self.cgetpage, self.cparsepage
        self.url = fetcher.url = url
        self.rule = parser.rule = rule
        fetcher.getPage()
        self.page = parser.page = fetcher.page
        parser.parsePage()
        self.match = parser.match

def runTest():
    '''Fetch each known download page and print the counter found on it.

    One line is printed per site: a right-aligned index, the site name and
    either the captured number or a short failure note. A failing site does
    not stop the remaining sites from being tried.
    '''
    # (site name, regex with one capture group for the counter, page url)
    sites = (
        ('huajun',
         # NOTE(review): the published source had broken quoting here;
         # presumably the escaping backslashes were lost. The brackets are
         # escaped so they match literally instead of forming a character set.
         r"hit\[587\]='47588,([0-9]+)",
         'http://www.onlinedown.net/soft/hitjs/hits47.js'),
        ('skycn',
         u'下载次数:</b>&nbsp;&nbsp;([0-9]+)',
         'http://www.skycn.com/soft/23265.html'),
    )

    #initialization (the CGetMatch constructor needs a url and downloads it)
    ccount = CGetMatch('http://www.sina.com.cn','')

    for index, (sitename, rule, url) in enumerate(sites):
        label = str(index + 1).rjust(2) + '.' + sitename.ljust(12) + ':'
        try:
            ccount.getMatch(url,rule)
        except Exception:
            # Best effort: report the failure and continue with the next site.
            # sys.exc_info() is used instead of "except ... as" so the code
            # also parses on old Python 2 releases.
            funcOutputMessage(label + 'failed (' + repr(sys.exc_info()[1]) + ')')
            continue
        if ccount.match is None:
            funcOutputMessage(label + 'no match')
            continue
        funcOutputMessage(label + str(ccount.match.group(1)))


# The entry point sits below runTest: with the guard placed above the
# definition, running the file as a script raised NameError.
if __name__ == '__main__':
    funcOutputMessage('===This is a get url script===')
    runTest()

责任编辑 webmaster

 
 
 
 
 
评论更多>>
 
 
 
发表
 
姓名: QQ:
性别: MSN:
E-mail: 主页:
评分: 1 2 3 4 5
评论内容:
验证码:
  
  • 请遵守《互联网电子公告服务管理规定》及中华人民共和国其他各项有关法律法规。
  • 严禁发表危害国家安全、损害国家利益、破坏民族团结、破坏国家宗教政策、破坏社会稳定、侮辱、诽谤、教唆、淫秽等内容的评论 。
  • 用户需对自己在使用本站服务过程中的行为承担法律责任(直接或间接导致的)。
  • 本站管理员有权保留或删除评论内容。
  • 评论内容只代表网友个人观点,与本网站立场无关。
  •