Commit 5c86d2f7 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers

parent 58b75087
...@@ -13,7 +13,7 @@ USAGE: ...@@ -13,7 +13,7 @@ USAGE:
import scrapy, re import scrapy, re
from cuartoPoder.items import NoticiasItem from cuartoPoder.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, date, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
...@@ -40,7 +40,9 @@ class ImportantData(scrapy.Item): ...@@ -40,7 +40,9 @@ class ImportantData(scrapy.Item):
Useful data for the flow of the implementation Useful data for the flow of the implementation
""" """
to_next_page = scrapy.Field() to_next_page = scrapy.Field()
next_page = scrapy.Field() is_last_link = scrapy.Field()
next_page = scrapy.Field()
return_url = scrapy.Field()
...@@ -57,10 +59,17 @@ class QuotesSpider(scrapy.Spider): ...@@ -57,10 +59,17 @@ class QuotesSpider(scrapy.Spider):
self.month = getattr(self, "month", None) self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None) self.day = getattr(self, "day", None)
self.stop_date = date(int(self.year), int(self.month), int(self.day))
self.baseURL = "http://www.cuartopoder.mx" self.baseURL = "http://www.cuartopoder.mx"
first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2)) first_URL = self.baseURL + "/archivo/portada/listado/{1}-{2}-{0}/{1}-{2}-{0}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&" self.second_URL = self.baseURL + "/XStatic/cuartopoder/template/cargaBloque.aspx?strControl=ctrlArchivoResultadosPaginadoListado&"
self.month_parser = {"Enero" : 1, "Mayo" : 5, "Septiembre" : 9,
"Febrero" : 2, "Junio" : 6, "Octubre" : 10,
"Marzo" : 3, "Julio" : 7, "Noviembre" : 11,
"Abril" : 4, "Agosto" : 8, "Diciembre" : 12}
flow_info = ImportantData() flow_info = ImportantData()
flow_info['to_next_page'] = False flow_info['to_next_page'] = False
flow_info['next_page'] = 2 flow_info['next_page'] = 2
...@@ -74,15 +83,26 @@ class QuotesSpider(scrapy.Spider): ...@@ -74,15 +83,26 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
flow_info = response.meta['item'] flow_info = response.meta['item']
page = flow_info['next_page']
for link in response.css('ul.news-list').xpath('./li/h5/a/@href').extract(): if not flow_info['to_next_page']:
flow_info['to_next_page'] = True link_list = response.css('ul.news-list').xpath('./li/h5/a/@href').extract()
news_link = self.baseURL + link
for link in link_list:
yield scrapy.Request(url=news_link, callback=self.parse_item) flow_info = ImportantData()
flow_info['next_page'] = page
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
news_link = self.baseURL + link
request = scrapy.Request(url=news_link, callback=self.parse_item)
request.meta['item'] = flow_info
yield request
if flow_info['to_next_page']: else:
page = flow_info['next_page']
page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page)) page_URL = self.second_URL + "p={3}&eids=&fd={1}-{2}-{0}&fh={1}-{2}-{0}&id=portada".format(self.year, self.month.zfill(2), self.day.zfill(2), str(page))
flow_info['to_next_page'] = False flow_info['to_next_page'] = False
...@@ -94,28 +114,43 @@ class QuotesSpider(scrapy.Spider): ...@@ -94,28 +114,43 @@ class QuotesSpider(scrapy.Spider):
yield request yield request
def parse_item(self, response):
    """
    Parse a single news-article page.

    Extracts the article's publication date from the metadata list and,
    only when it matches ``self.stop_date`` (the date the spider was asked
    to crawl), builds and yields a ``NoticiasItem`` with date, title,
    topic, body text and URL.  After the last article of a listing page
    (``is_last_link``), re-enqueues the listing URL so ``parse`` advances
    to the next results page.

    Parameters
    ----------
    response : scrapy.http.Response
        Article page; ``response.meta['item']`` carries the
        ``ImportantData`` flow-control item set by ``parse``.

    Yields
    ------
    NoticiasItem
        The scraped article, when its date matches ``stop_date``.
    scrapy.Request
        A follow-up request back to ``parse`` for pagination, when this
        was the last article link of the listing page.
    """
    # Date appears as e.g. "Mayo\xa012, 2018": Spanish month name, then
    # day and year separated by a non-breaking space.
    news_date = response.css('ul.metas-list > li > p').extract_first()
    news_date = remove_tags(news_date)
    news_date = news_date.split(u'\xa0')
    news_date[1] = news_date[1].strip().replace(",", '')
    # month_parser maps the Spanish month name to its number (1-12).
    news_date = date(int(self.year), self.month_parser[news_date[0]], int(news_date[1]))

    # Only articles published exactly on the requested date are scraped.
    if news_date == self.stop_date:
        flow_info = response.meta['item']

        item = NoticiasItem()
        text = ''
        # Store the date as a timezone-aware ISO-8601 string.
        news_date = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat("T")

        title = response.css('div.post-title').css('h1').extract_first()
        if title is not None : title = remove_tags(title)

        topic = response.css('div.big-title').xpath('./h2/a/span').extract_first()
        if topic is not None : topic = remove_tags(topic)

        for p in response.css('div.post-content').css('p').extract():
            p = remove_tags(p)
            text += p + "\n"

        ## News item info ##
        item['date'] = news_date
        # BUG FIX: title may legitimately be None (guarded above); calling
        # .strip() unconditionally raised AttributeError on such pages.
        item['title'] = title.strip() if title is not None else title
        item['topic'] = topic
        item['text'] = text.strip()
        item['url'] = response.url

        yield item

        # Pagination hand-off: after the last article of the listing page,
        # return to `parse` (dont_filter because the URL was already seen)
        # with to_next_page set so it requests the next results page.
        if flow_info['is_last_link']:
            flow_info['to_next_page'] = True
            request = scrapy.Request(url=flow_info['return_url'], callback=self.parse, dont_filter=True)
            request.meta['item'] = flow_info
            yield request
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment