Commit 23efea2e authored by Renán Sosa Guillen

crawlers

parent 2e6e04a0
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import scrapy, re import scrapy, re, json
from elFinanciero.items import NoticiasItem from elFinanciero.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo from datetime import datetime, timedelta, tzinfo
...@@ -32,9 +32,9 @@ class QuotesSpider(scrapy.Spider): ...@@ -32,9 +32,9 @@ class QuotesSpider(scrapy.Spider):
def start_requests(self): def start_requests(self):
self.tz = UTC() self.tz = UTC()
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, # self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8, # 'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12} # 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
self.baseURL = "http://www.elfinanciero.com.mx/rss" self.baseURL = "http://www.elfinanciero.com.mx/rss"
yield scrapy.Request(url=self.baseURL, callback=self.parse) yield scrapy.Request(url=self.baseURL, callback=self.parse)
...@@ -49,15 +49,28 @@ class QuotesSpider(scrapy.Spider): ...@@ -49,15 +49,28 @@ class QuotesSpider(scrapy.Spider):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
d = response.xpath('//*[@class="details-box"]/input[@id="publicado"]/@value').extract_first() res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
d = d.replace('/',' ').replace(':',' ').split(' ') resDict = json.loads(res)
item['date'] = datetime(int(d[2]), self.date_parser[d[1].lower()], int(d[0]), int(d[3]), int(d[4]), tzinfo=self.tz).isoformat('T') dt = resDict['datePublished']
item['topic'] = response.xpath('//*[@class="article-title"]/span/text()').extract_first() d,t = dt.split()
item['title'] = response.xpath('//*[@class="title"]/h1/text()').extract_first() d = map(int, d.split("-"))
for p in response.xpath('//*[@class="article-paragraphs"]/p').extract(): t = map(int, t.split(":"))
item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
topic = response.xpath('//div[@class="section-line"]').extract_first()
if topic is not None:
item['topic'] = remove_tags(topic)
author = response.xpath('//div[@class="note-author"]/a').extract_first()
if author is not None:
item['author'] = remove_tags(author)
for p in response.css('div.content').css('p').extract():
text += remove_tags(p) + '\n' text += remove_tags(p) + '\n'
item['text'] = text item['text'] = text.strip()
item['author'] = response.xpath('//*[@class="details-box"]/input[@id="editor"]/@value').extract_first()
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
......
File mode changed from 100644 to 100755
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment