Commit 23efea2e authored by Renán Sosa Guillen

crawlers

parent 2e6e04a0
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
# -*- coding: utf-8 -*-
import scrapy, re
import scrapy, re, json
from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
......@@ -32,9 +32,9 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self):
self.tz = UTC()
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
# self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
# 'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
# 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL = "http://www.elfinanciero.com.mx/rss"
yield scrapy.Request(url=self.baseURL, callback=self.parse)
......@@ -49,15 +49,28 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem()
text = ''
d = response.xpath('//*[@class="details-box"]/input[@id="publicado"]/@value').extract_first()
d = d.replace('/',' ').replace(':',' ').split(' ')
item['date'] = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), int(d[3]), int(d[4]), tzinfo=self.tz).isoformat('T')
item['topic'] = response.xpath('//*[@class="article-title"]/span/text()').extract_first()
item['title'] = response.xpath('//*[@class="title"]/h1/text()').extract_first()
for p in response.xpath('//*[@class="article-paragraphs"]/p').extract():
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
resDict = json.loads(res)
dt = resDict['datePublished']
d,t = dt.split()
d = map(int, d.split("-"))
t = map(int, t.split(":"))
item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
topic = response.xpath('//div[@class="section-line"]').extract_first()
if topic is not None:
item['topic'] = remove_tags(topic)
author = response.xpath('//div[@class="note-author"]/a').extract_first()
if author is not None:
item['author'] = remove_tags(author)
for p in response.css('div.content').css('p').extract():
text += remove_tags(p) + '\n'
item['text'] = text
item['author'] = response.xpath('//*[@class="details-box"]/input[@id="editor"]/@value').extract_first()
item['text'] = text.strip()
item['url'] = response.url
# print item['title']
......
File mode changed from 100644 to 100755
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment