Commit c6e78aaa authored by Mario Chirinos's avatar Mario Chirinos

chiapas hoy

parent ee0af704
# -*- coding: utf-8 -*-
import scrapy, re
from chiapasHoy.items import NoticiasItem
import datetime
"""
MEDIO:
Chiapas Hoy, Chiapas
......@@ -22,61 +22,43 @@ class QuotesSpider(scrapy.Spider):
# Spider identifier — NOTE(review): presumably invoked as `scrapy crawl noticias`.
name = "noticias"
def start_requests(self):
    """Build the day-archive URL for the requested date and issue the first request.

    Expects ``year``, ``month`` and ``day`` to be supplied as spider
    arguments (e.g. ``scrapy crawl noticias -a year=2020 -a month=1 -a day=5``).
    The parsed ``datetime.date`` is forwarded to the callbacks via ``cb_kwargs``.
    """
    year = getattr(self, "year", None)
    month = getattr(self, "month", None)
    day = getattr(self, "day", None)
    # Validates the arguments early: raises if missing or not a real calendar date.
    date = datetime.date(int(year), int(month), int(day))
    self.baseURL = ("http://www.chiapashoy.com.mx/notashoy/"
                    + year + "/" + month.zfill(2) + "/" + day.zfill(2))
    # Single request only — the diff residue previously duplicated this yield,
    # which would have crawled the same archive page twice.
    yield scrapy.Request(url=self.baseURL, callback=self.parse, cb_kwargs={"date": date})
def parse(self, response, **kwargs):
    """Follow every article link on the archive listing page, then paginate.

    ``kwargs`` carries the crawl date injected by ``start_requests`` and is
    propagated unchanged to ``parse_item`` and to the next listing page.
    """
    # Article links live under <article> ... <h3> <a href=...>.
    links = response.css('article').css('h3').css('a::attr(href)').extract()
    for link in links:
        yield scrapy.Request(url=link, callback=self.parse_item, cb_kwargs=kwargs)
    # Follow the "next page" link of the paginated archive, when present.
    nextPage = response.xpath('//*[@class="nav-links"]/a[@class="next page-numbers"]/@href').extract_first()
    if nextPage:
        yield scrapy.Request(url=nextPage, callback=self.parse, cb_kwargs=kwargs)
def parse_item(self, response, **kwargs):
    """Extract a single news article into a NoticiasItem.

    ``kwargs["date"]`` is the requested crawl date (a ``datetime.date``)
    injected by ``start_requests`` via ``cb_kwargs``.
    """
    item = NoticiasItem()
    # Use the requested crawl date rather than the page's own timestamp.
    item['date'] = kwargs["date"].strftime('%Y/%m/%d')
    item['title'] = response.css("h1.entry-title::text").extract_first()
    # Guard: the category element may be absent; the previous code called
    # .replace() directly on extract_first(), raising AttributeError on None.
    topic = response.css('li.meta-category').css('a::text').extract_first()
    if topic is not None:
        item['topic'] = topic.replace(" ", "").replace("\n", "")
    paragraphs = response.css("article").css("div.entry-content").css("p").extract()
    if paragraphs:
        # NOTE(review): the last paragraph is assumed to be the author byline
        # — confirm against the site's article layout.
        item['author'] = remove_tags(paragraphs[-1])
    # Body text: all paragraphs, tags stripped, one per line.
    text = ""
    for p in paragraphs:
        text += remove_tags(p) + "\n"
    item['text'] = text.strip()
    item['url'] = response.url
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment