Commit a16827a4 authored by Renán Sosa Guillen

merge with dev

parents 3edb1097 5c86d2f7
......@@ -44,7 +44,7 @@ Se incluyen los siguientes medios nacionales:
Acceso por día:
```bash
http://www.cuartopoder.mx/archivo/portada/listado/30-08-2018/30-08-2018/
http://www.cuartopoder.mx/archivo/portada/listado/8-30-2018/8-30-2018/
```
Uso:
......
......@@ -13,7 +13,7 @@ USAGE:
import scrapy, re
from cuartoPoder.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
from datetime import datetime, date, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
......@@ -40,7 +40,9 @@ class ImportantData(scrapy.Item):
Useful data for the flow of the implementation
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
next_page = scrapy.Field()
return_url = scrapy.Field()
......@@ -57,10 +59,17 @@ class QuotesSpider(scrapy.Spider):
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
self.stop_date = date(int(self.year), int(self.month), int(self.day))
self.baseURL = "http://www.cuartopoder.mx"
first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"
self.month_parser = {"Enero" : 1, "Mayo" : 5, "Septiembre" : 9,
"Febrero" : 2, "Junio" : 6, "Octubre" : 10,
"Marzo" : 3, "Julio" : 7, "Noviembre" : 11,
"Abril" : 4, "Agosto" : 8, "Diciembre" : 12}
flow_info = ImportantData()
flow_info['to_next_page'] = False
flow_info['next_page'] = 2
......@@ -74,15 +83,26 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
flow_info = response.meta['item']
page = flow_info['next_page']
if not flow_info['to_next_page']:
link_list = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
for link in link_list:
flow_info = ImportantData()
flow_info['next_page'] = page
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract():
flow_info['to_next_page'] = True
news_link = self.baseURL + link
request = scrapy.Request(url=news_link, callback=self.parse_item)
request.meta['item'] = flow_info
yield scrapy.Request(url=news_link, callback=self.parse_item)
yield request
if flow_info['to_next_page']:
page = flow_info['next_page']
else:
page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))
flow_info['to_next_page'] = False
......@@ -94,8 +114,15 @@ class QuotesSpider(scrapy.Spider):
yield request
def parse_item(self, response):
news_date = response.css('ul.metas-list > li > p').extract_first()
news_date = remove_tags(news_date)
news_date = news_date.split(u'\xa0')
news_date[1] = news_date[1].strip().replace(",", '')
news_date = date(int(self.year), self.month_parser[news_date[0]], int(news_date[1]))
if news_date == self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
......@@ -113,9 +140,17 @@ class QuotesSpider(scrapy.Spider):
## News item info ##
item['date'] = news_date
item['title'] = title
item['title'] = title.strip()
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse, dont_filter=True)
request.meta['item'] = flow_info
yield request
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment