A complete Python Scrapy crawling demo

Stack: Redis + Python 2.7 + Scrapy

Environment: Windows 10 + Python 2.7 + Scrapy

What the demo does: a complete Scrapy-based crawler, covering scraping, cleaning, and saving the data.

PS: if any module is missing, install it with pip.

Cleaning the data

We scrape the code snippet (gist) listings from https://gitee.com/gists.

Use XPath or CSS node selectors to locate the data you need.
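As a quick illustration, the two selector styles are interchangeable inside a Scrapy callback (the .title-name class comes from the demo below; the XPath form is only a rough equivalent of the CSS one):

titles = response.css('.title-name').extract()
# roughly the same selection expressed as XPath:
titles = response.xpath('//*[contains(@class, "title-name")]').extract()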

Demo code

import scrapy
from scrapy.selector import Selector

# assumption: the Item class lives in the project's items.py
from codesspider.items import CodesspiderItem


class GiteeSpider(scrapy.Spider):
    name = 'gitee'
    allowed_domains = ['gitee.com']
    start_urls = ['https://gitee.com/gists']

    def start_requests(self):
        urls = []
        for page_index in range(1, 51):  # build the list of URLs to crawl (pages 1 through 50)
            url = 'https://gitee.com/gists?page=' + str(page_index)
            urls.append(url)

        for url in urls:  # schedule each listing page
            print(url)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):  # handle each listing page in turn
        arr_blocks = response.css('.single-block').extract()
        for b_txt in arr_blocks:
            b = Selector(text=b_txt)
            arr_titles = b.css('.title-name').extract()
            arr_create_time_blocks = b.css('.create-time').extract()

            # create a fresh item per block; a single shared item would be
            # overwritten before the scheduled requests got to use it
            data_item = CodesspiderItem()
            self.process_create_time_label(arr_create_time_blocks[0], data_item)
            self.process_title_and_url(arr_titles[0], data_item)
            yield scrapy.Request(url=data_item['url'], meta={'data_item': data_item}, callback=self.parse_content)
            
    # fetch the code snippet itself
    def parse_content(self, response):  # handle each gist's detail page
        data_item = response.meta["data_item"]
        arr = response.css('.file_content').extract()
        if len(arr) > 0:
            sel = Selector(text=arr[0])
            data_item['content'] = ''.join(sel.xpath('//div/text()').extract())
            # print('parse_content ========> ' + data_item['content'])
            yield data_item
                
    ### extract the gist's title and URL
    def process_title_and_url(self, item, data_item):
        obj_a = Selector(text=item)
        
        title = obj_a.xpath('//a/text()').extract()[0]
        href = 'https://gitee.com' + obj_a.xpath('//a/@href').extract()[0]
        # print(title + ' ==> ' + href)
        data_item['title'] = title
        data_item['url'] = href

        
    ### extract the creation time and labels; the labels indicate the programming language
    def process_create_time_label(self, item, data_item):
        obj_a = Selector(text=item)
        
        create_time = obj_a.xpath('//div/span/text()').extract()[0]
        lang_labels = obj_a.xpath('//div/div/a/text()').extract()
        # print(create_time + ' ==> ' + ' '.join(lang_labels))
        data_item['create_time'] = create_time
        data_item['labels'] = lang_labels

parse(): extracts the listing fields, such as the title and creation time.

parse_content(): once the title, creation time, and gist URL are known, downloads the gist's actual code content.

 

Core code: defining the Item object to be persisted
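The post references CodesspiderItem without showing its definition; a minimal sketch of items.py, consistent with the fields the spider fills in and the pipeline reads, would be:

import scrapy


class CodesspiderItem(scrapy.Item):
    # one field per value used by the spider and pipeline
    title = scrapy.Field()
    url = scrapy.Field()
    labels = scrapy.Field()
    content = scrapy.Field()
    create_time = scrapy.Field()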

Persistence

Write a Pipeline that persists the data to Redis. The item being persisted is the item argument of process_item.

https://gitee.com/youlixishi/codes-spider/blob/master/src/codesspider/codesspider/pipelines.py

import json

import redis


class CodesspiderPipeline(object):
    def process_item(self, item, spider):
        # note: this opens a new Redis connection per item; see the open_spider variant below
        r = redis.Redis(**{
            'host': '192.168.222.130',  # if Redis rejects remote clients: config set protected-mode "no"
            'port': 6383,
            'db': 0
        })
        blog = {
            "title": item['title'],
            "labels": item['labels'],
            "url": item['url'],
            "content": item['content'],
            "create_time": item['create_time']
        }
        r.lpush("list_blogs", json.dumps(blog))
        # alternatively, append each record to a local file instead:
        # with open("gitee.gist.txt", 'a') as fp:
        #     fp.write(
        #         (item['title'] + '$&' + ','.join(item['labels']) + '$&' + item['url'] + '$&' + item['content'] + '$&' + item['create_time'].replace('\n', '')).encode('utf-8')
        #         + '\n')
        return item


class blog(object):
    def __init__(self, title, labels, url, content, create_time):
        self.title = title
        self.labels = labels
        self.url = url
        self.content = content
        self.create_time = create_time
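Opening a new Redis connection for every item works, but is wasteful. A common Scrapy idiom is to open the connection once when the spider starts and release it when the spider finishes; a sketch, assuming the same host, port, and key as above:

import json

import redis


class CodesspiderPipeline(object):
    def open_spider(self, spider):
        # connect once when the spider starts
        self.r = redis.Redis(host='192.168.222.130', port=6383, db=0)

    def close_spider(self, spider):
        # release the pooled connections when the spider finishes
        self.r.connection_pool.disconnect()

    def process_item(self, item, spider):
        blog = {
            "title": item['title'],
            "labels": item['labels'],
            "url": item['url'],
            "content": item['content'],
            "create_time": item['create_time']
        }
        self.r.lpush("list_blogs", json.dumps(blog))
        return item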

 

Registering the persistence Pipeline

Edit settings.py (300 is the pipeline's order value in the 0-1000 range; lower numbers run earlier):

ITEM_PIPELINES = {
   'codesspider.pipelines.CodesspiderPipeline': 300
}

 

Adding a run entry point, run.py (optional)

from scrapy import cmdline

if __name__ == '__main__':
    cmdline.execute("scrapy crawl gitee".split())

This launches the crawler exactly as running scrapy crawl gitee from the command line would.
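To check that the data actually landed in Redis, you can read the list back afterwards (a quick sketch, assuming the same host, port, and key as above):

import json

import redis

r = redis.Redis(host='192.168.222.130', port=6383, db=0)
print(r.llen('list_blogs'))  # number of stored gists
blog = json.loads(r.lindex('list_blogs', 0))  # lpush puts the newest entry first
print(blog['title'] + ' ==> ' + blog['url'])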

Download the complete project and try it yourself.

 

