Stack: redis + python2.7 + scrapy
Runtime environment: Windows 10 + python2.7 + scrapy
Demo scope: a complete Scrapy-based crawler demo, covering scraping, cleaning, and saving the data
PS: if any module is missing, remember to install it with pip
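For this demo that means Scrapy and the Redis client (note that on Python 2.7 you may need to pin an older Scrapy 1.x release):

pip install scrapy redis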
We crawl the code demo listings on https://gitee.com/gists,
using XPath or CSS (node) selectors to pick out the data we need.
import scrapy
from scrapy.selector import Selector

from codesspider.items import CodesspiderItem


class GiteeSpider(scrapy.Spider):
    name = 'gitee'
    allowed_domains = ['gitee.com']
    start_urls = ['https://gitee.com/gists']

    def start_requests(self):
        urls = []
        for page_index in range(1, 51):  # build the list-page URLs to crawl
            url = 'https://gitee.com/gists?page=' + str(page_index)
            urls.append(url)
        for url in urls:  # crawl each list page in turn
            print url
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):  # process each list page
        arr_blocks = response.css('.single-block').extract()
        for b_txt in arr_blocks:
            b = Selector(text=b_txt)
            data_item = CodesspiderItem()  # one fresh item per block
            arr_titles = b.css('.title-name').extract()
            arr_create_time_blocks = b.css('.create-time').extract()
            self.process_create_time_label(arr_create_time_blocks[0], data_item)
            self.process_title_and_url(arr_titles[0], data_item)
            # follow the detail page to fetch the demo code itself
            yield scrapy.Request(url=data_item['url'], meta={'data_item': data_item}, callback=self.parse_content)

    def parse_content(self, response):  # process each record's detail page
        data_item = response.meta['data_item']
        data_item['content'] = ''  # default when no file content is found
        arr = response.css('.file_content').extract()
        if len(arr) > 0:
            sel = Selector(text=arr[0])
            data_item['content'] = ''.join(sel.xpath('//div/text()').extract())
        yield data_item

    # extract the demo's title and detail-page URL
    def process_title_and_url(self, item, data_item):
        obj_a = Selector(text=item)
        title = obj_a.xpath('//a/text()').extract()[0]
        href = 'https://gitee.com' + obj_a.xpath('//a/@href').extract()[0]
        data_item['title'] = title
        data_item['url'] = href

    # extract the creation time and the labels (the labels indicate the language)
    def process_create_time_label(self, item, data_item):
        obj_a = Selector(text=item)
        create_time = obj_a.xpath('//div/span/text()').extract()[0]
        lang_labels = obj_a.xpath('//div/div/a/text()').extract()
        data_item['create_time'] = create_time
        data_item['labels'] = lang_labels
parse(): extracts the list content, e.g. the title and creation time.
parse_content(): after the title, creation time, and demo URL have been obtained, downloads the demo code itself.
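To make the nested-selector pattern concrete, here is a minimal, self-contained sketch; the HTML fragment is invented to mirror the class names the spider selects on, so the real gitee.com markup may differ:

# -*- coding: utf-8 -*-
from scrapy.selector import Selector

# hypothetical list-page fragment, for illustration only
html = u'''
<div class="single-block">
  <a class="title-name" href="/someone/codes/abc123">hello.py</a>
  <div class="create-time"><span>2018-01-01</span><div><a>Python</a></div></div>
</div>
'''

for block in Selector(text=html).css('.single-block').extract():
    b = Selector(text=block)                   # re-wrap each block in its own Selector
    title_html = b.css('.title-name').extract()[0]
    a = Selector(text=title_html)
    print a.xpath('//a/text()').extract()[0]   # -> hello.py
    print a.xpath('//a/@href').extract()[0]    # -> /someone/codes/abc123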
Core code: define the persistence object, the Item.
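The spider above assumes CodesspiderItem is defined in items.py; here is a minimal sketch with the five fields the spider fills and the pipeline reads (the actual file in the repo may differ):

# -*- coding: utf-8 -*-
import scrapy


class CodesspiderItem(scrapy.Item):
    title = scrapy.Field()        # demo title
    labels = scrapy.Field()       # language labels
    url = scrapy.Field()          # detail-page URL
    content = scrapy.Field()      # demo source code
    create_time = scrapy.Field()  # creation time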
Write the Pipeline that persists to Redis; the item being persisted is the item passed to process_item.
https://gitee.com/youlixishi/codes-spider/blob/master/src/codesspider/codesspider/pipelines.py
import json

import redis


class CodesspiderPipeline(object):
    def process_item(self, item, spider):
        # connect to Redis (if protected mode blocks remote connections,
        # run on the server: config set protected-mode "no")
        r = redis.Redis(**{
            'host': '192.168.222.130',
            'port': 6383,
            'db': 0
        })
        blog = {
            "title": item['title'],
            "labels": item['labels'],
            "url": item['url'],
            "content": item['content'],
            "create_time": item['create_time']
        }
        # push each record onto a Redis list as a JSON string
        r.lpush("list_blogs", json.dumps(blog))
        return item


class blog(object):
    # plain-object form of a record (the pipeline above builds a dict instead)
    def __init__(self, title, labels, url, content, create_time):
        self.title = title
        self.labels = labels
        self.url = url
        self.content = content
        self.create_time = create_time
Register the persistence pipeline by editing settings.py; the number (300) is the pipeline's priority, from 0 to 1000, with lower values running first.
ITEM_PIPELINES = {
    'codesspider.pipelines.CodesspiderPipeline': 300
}
from scrapy import cmdline

if __name__ == '__main__':
    cmdline.execute("scrapy crawl gitee".split())

This launches the spider from the command line; equivalently, run scrapy crawl gitee directly in the project directory.
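Once the crawl finishes, you can spot-check the data from redis-cli (LLEN list_blogs) or with a short Python snippet; the connection parameters below are assumed to match the pipeline above:

# -*- coding: utf-8 -*-
import json

import redis

r = redis.Redis(host='192.168.222.130', port=6383, db=0)
print r.llen('list_blogs')                # number of records pushed
for raw in r.lrange('list_blogs', 0, 4):  # the five newest (lpush puts new items at the head)
    blog = json.loads(raw)
    print blog['title'], '==>', blog['url']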