Commit 58d686cc authored by Renán Sosa Guillen

crawlers

parent f3e69d86
...@@ -22,9 +22,13 @@ class QuotesSpider(scrapy.Spider): ...@@ -22,9 +22,13 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day # self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
# self.baseURL = "http://yucatanalamano.com/" + year + "/" + month + "/" + day
urlList = ["http://www.yucatanalamano.com/" + year + "/" + month + "/" + day,
"http://yucatanalamano.com/" + year + "/" + month + "/" + day]
yield scrapy.Request(url=self.baseURL, callback=self.parse) for url in urlList:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
...@@ -36,7 +40,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -36,7 +40,7 @@ class QuotesSpider(scrapy.Spider):
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages): for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
...@@ -47,7 +51,13 @@ class QuotesSpider(scrapy.Spider): ...@@ -47,7 +51,13 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first() title = response.xpath('//div[@class="main-col"]/div[@itemprop="name"]/text()').extract_first()
if title is None:
title = response.xpath('//div[@class="main-col"]').css('h1').extract_first()
if title is not None:
item['title'] = remove_tags(title)
else:
item['title'] = title
d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first() d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico) ## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment