Commit 2d809dad authored by Renán Sosa Guillen

crawlers

parent 11c4aa01
......@@ -8,6 +8,7 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return *text* with every match of ``TAG_RE`` removed.

    ``TAG_RE`` is the module-level pattern ``<[^>]+>``, so each
    angle-bracketed markup tag is dropped and the text between tags is kept.
    """
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,6 +18,7 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
......@@ -30,6 +32,7 @@ class QuotesSpider(scrapy.Spider):
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if ( len(pagination) > 0 ):
......@@ -43,14 +46,16 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Schedule one request per article link found on a listing page.

    Every ``href`` under ``h2.cat-list-title`` is requested with
    ``parse_item`` as the callback.
    """
    hrefs = response.xpath('//h2[@class="cat-list-title"]/a/@href').extract()
    for href in hrefs:
        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the page URL; scrapy.Request rejects URLs without a scheme.
        yield scrapy.Request(url=response.urljoin(href), callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
item['date'] = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment