noticias.py
import re

import scrapy


## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22

"""
Crawler descarga noticias hasta el 2017.10.17
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
	"""Strip HTML tags from a fragment of markup."""
	return TAG_RE.sub('', text)

class NoticiasItem(scrapy.Item):
	title = scrapy.Field()
	text = scrapy.Field()
	date = scrapy.Field()
	location = scrapy.Field()
	author = scrapy.Field()
	topic = scrapy.Field()
	url = scrapy.Field()

class QuotesSpider(scrapy.Spider):
	name = "noticias"
	
	def start_requests(self):
		# year, month and day are passed on the command line with -a (see the usage example above)
		year = getattr(self, 'year', None)
		month = getattr(self, 'month', None)
		day = getattr(self, 'day', None)
		self.baseURL = 'http://florcastillo.mx/noticias/' + year + '/' + month + '/' + day

		yield scrapy.Request(url=self.baseURL, callback=self.parse)


	def parse(self, response):
		pagination = response.css('div.pagination').css('a::attr(href)').extract()
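		# the href of the last pagination link ends with the highest page number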
		
		if ( len(pagination) > 0 ):
			pagination = pagination[-1]
			pages = int(pagination[pagination.rfind('/')+1:])
			
			for page in range(0, pages):
				if ( page == 0 ):
					yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
				
				else:
					yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
		
		else:
			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)


	def parse_page(self, response):
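		# each entry in the article listing links to a full news page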
		for link in response.css('div.list-block').xpath('./h3/a/@href').extract():
			yield scrapy.Request(url=link, callback=self.parse_item)

	def parse_item(self, response):
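		# assemble a NoticiasItem from the article page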
		item = NoticiasItem()
		text = ''
		item['title'] = response.css('div.post-title').css('h1.entry-title::text').extract_first()
		
		d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
		## '-06:00' is UTC-6, the time zone of Yucatan (central Mexico)
		if d[-6:] != '-06:00':
			d = d[:-6] + '-06:00'
		item['date'] = d
		
		item['topic'] = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()[:-2]  # drop the last two characters left by the breadcrumb text
		
		for paragraph in response.css('div.post_content').css('p').extract():
			text += remove_tags(paragraph) + '\n'
		item['text'] = text
		item['url'] = response.url
		
		# print item['title']
		yield item