Commit 58d686cc authored by Renán Sosa Guillen

crawlers

parent f3e69d86
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
......@@ -22,9 +22,13 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
# self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
# self.baseURL = "http://yucatanalamano.com/" + year + "/" + month + "/" + day
urlList = ["http://www.yucatanalamano.com/" + year + "/" + month + "/" + day,
"http://yucatanalamano.com/" + year + "/" + month + "/" + day]
yield scrapy.Request(url=self.baseURL, callback=self.parse)
for url in urlList:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
......@@ -36,7 +40,7 @@ class QuotesSpider(scrapy.Spider):
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
......@@ -47,7 +51,13 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.css('div.main_container').css('h1.post-tile::text').extract_first()
title = response.xpath('//div[@class="main-col"]/div[@itemprop="name"]/text()').extract_first()
if title is None:
title = response.xpath('//div[@class="main-col"]').css('h1').extract_first()
if title is not None:
item['title'] = remove_tags(title)
else:
item['title'] = title
d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
## '-06:00' corresponde al UTC-6, zona horaria de yucatan (centro de mexico)
......
No preview for this file type
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment