Commit 15cbb498 authored by Renán Sosa Guillen

crawlers

parent 687a5463
 # -*- coding: utf-8 -*-
-import scrapy, re
-from diarioCoLatino.items import NoticiasItem
 """
-MEDIO:
-Diario Co Latino, El Salvador
-USO:
-scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
+MEDIA:
+Diario Co Latino, El Salvador
+USAGE:
+## Get all the news from a specific date. ##
+---------------------------------------------------------------------------------------------
+$ cd diarioCoLatino/
+$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
 """
+import scrapy, re
+from diarioCoLatino.items import NoticiasItem

 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
     return TAG_RE.sub('', text)
@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
 EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
     name = "noticias"

     def start_requests(self):
         year = getattr(self, "year", None)
         month = getattr(self, "month", None)
@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
         yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider):
             yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)

     def parse_page(self, response):
-        for link in response.xpath('//div[@class="post-listing"]/article/h2/a/@href').extract():
+        for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''

-        "La fecha obtenida ya incluye formato y zona horaria"
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
+        # The date obtained already includes format and time zone
+        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()

-        item['topic'] = None
+        news_topic = None

         for p in response.xpath('//div[@class="entry"]/p').extract():
             text += remove_tags(p) + "\n"
@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider):
             text = "\n" + text

         """ Get author """
+        news_author = None
         res = AUTH_RE.match(text)
         if res:
             m = res.group(0)
-            item['author'] = m[m.find('Por')+len('Por'):].strip()
+            news_author = m[m.find('Por')+len('Por'):].strip()
             text = text[text.find(m) + len(m):].strip()
             text = "\n" + text

         """ Remove twitter """
+        news_twitter = None
         res = TW_RE.search(text)
         if res:
             m = res.group(0)
-            item['twitter'] = m.strip()
+            news_twitter = m.strip()
             text = text[text.find(m) + len(m):].strip()
             text = "\n" + text

         """ Get location """
+        news_loc = None
         res = LOC_RE.match(text)
         if res:
             m = res.group(0)
             if m[m.find('/') + 1:].strip().lower() != 'dpa':
-                item['location'] = m[:m.find('/')].strip()
+                news_loc = m[:m.find('/')].strip()
                 text = text[text.find(m) + len(m):].strip()
                 text = "\n" + text
             else:
@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider):
                 text = "\n" + text

         """ Remove email """
+        news_email = None
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
             # text = text[text.find(m) + len(m):].strip()
             text = text.replace(m, '').strip()
             text = "\n" + text
@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider):
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
             # text = text[text.find(m) + len(m):].strip()
             text = text.replace(m, '').strip()
             text = "\n" + text
@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider):
             text = "\n" + text

         text = text.replace("\nCo Latino\n", '').strip()

-        item['text'] = text.strip()
-        item['url'] = response.url
+        ## News item info ##
+        item['date'] = news_date
+        item['title'] = news_title
+        item['topic'] = news_topic
+        item['author'] = news_author
+        item['twitter'] = news_twitter
+        item['location'] = news_loc
+        item['email'] = news_email
+        item['text'] = text.strip()
+        item['url'] = response.url
         yield item
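
For reference, parse_item above extracts metadata by repeatedly matching a regex against the head of the body text and slicing the match off. Below is a minimal standalone sketch of that pass (Python 3) with a hypothetical strip_metadata helper. LOC_RE and EM_RE are copied from the diff; AUTH_RE and TW_RE are defined in a collapsed hunk, so the patterns used here are illustrative stand-ins, and the 'dpa' special case and second email pass are omitted.

# Standalone sketch of the metadata-stripping pass in parse_item above.
import re

AUTH_RE = re.compile(r'\nPor.*?\n', re.I)   # assumed byline pattern, not the committed one
TW_RE = re.compile(r'\n@\w+\n')             # assumed twitter-handle pattern, not the committed one
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

def strip_metadata(text):
    """Peel author, twitter, location and email off the body text,
    in the same order parse_item applies them."""
    fields = {'author': None, 'twitter': None, 'location': None, 'email': None}
    text = "\n" + text

    res = AUTH_RE.match(text)                # byline is anchored at the very top
    if res:
        m = res.group(0)
        fields['author'] = m[m.find('Por') + len('Por'):].strip()
        text = "\n" + text[text.find(m) + len(m):].strip()

    res = TW_RE.search(text)
    if res:
        m = res.group(0)
        fields['twitter'] = m.strip()
        text = "\n" + text[text.find(m) + len(m):].strip()

    res = LOC_RE.match(text)                 # "City/AGENCY" dateline
    if res:
        m = res.group(0)
        fields['location'] = m[:m.find('/')].strip()
        text = "\n" + text[text.find(m) + len(m):].strip()

    res = EM_RE.search(text)                 # the email line can sit anywhere
    if res:
        m = res.group(0)
        fields['email'] = m.strip()
        text = "\n" + text.replace(m, '').strip()

    return fields, text.strip()

print(strip_metadata("Por Juan Perez\nSan Salvador/PL\nCuerpo de la nota...\n"))
# -> ({'author': 'Juan Perez', 'twitter': None, 'location': 'San Salvador', 'email': None},
#     'Cuerpo de la nota...')

The second spider touched by this commit, for El Financiero's RSS feed, follows.
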
 # -*- coding: utf-8 -*-
+"""
+MEDIA:
+El Financiero, CDMX
+USAGE:
+## Get the news from RSS. ##
+---------------------------------------------------------------------------------------------
+$ cd elFinanciero/
+$ scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""
 import scrapy, re, json
 from elFinanciero.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo
-"""
-MEDIO:
-El Financiero, CDMX
-USO:
-scrapy crawl noticias --nolog -s filename=2017-12-20.json
-"""

 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
@@ -16,65 +22,73 @@ def remove_tags(text):

 class UTC(tzinfo):
-    """clase para el 'time zone' (zona horaria)"""
+    """
+    Class for Time Zone
+    """

     def utcoffset(self, dt):
-        # zona horaria para hidalgo (centro de mexico): utc-6
+        ## Time zone for CDMX: UTC-6 ##
         return timedelta(hours=-6)

     def tzname(self, dt):
-        # nombre de la zona horaria
+        ## Time zone name ##
         return 'UTC-6'
 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
     name = "noticias"

     def start_requests(self):
         self.tz = UTC()
         # self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
         #                     'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
         #                     'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
         self.baseURL = "http://www.elfinanciero.com.mx/rss"
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
         for link in response.xpath('//link/text()').extract()[1:]:
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''

-        res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
+        res = response.xpath('//script[@data-schema="NewsArticle"]').extract_first()
+        if res is not None: res = remove_tags(res)
         resDict = json.loads(res)
         dt = resDict['datePublished']
         d, t = dt.split()
         d = map(int, d.split("-"))
         t = map(int, t.split(":"))
-        item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
+        news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")

-        item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
+        title = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()

         topic = response.xpath('//div[@class="section-line"]').extract_first()
         if topic is not None:
-            item['topic'] = remove_tags(topic)
-        else:
-            item['topic'] = None
+            topic = remove_tags(topic)

         author = response.xpath('//div[@class="note-author"]/a').extract_first()
         if author is not None:
-            item['author'] = remove_tags(author)
+            author = remove_tags(author)

         for p in response.css('div.content').css('p').extract():
             text += remove_tags(p) + '\n'

-        item['text'] = text.strip()
-        item['url'] = response.url
-        # print item['title']
+        ## News item info ##
+        item['date'] = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['author'] = author
+        item['text'] = text.strip()
+        item['url'] = response.url
         yield item
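
One portability note on the parse_item above: d = map(int, d.split("-")) is later indexed as d[0], which works on Python 2, where map() returns a list, but raises a TypeError on Python 3, where it returns an iterator. A minimal sketch of the same datePublished handling that runs on both, reusing the UTC class from this diff (the sample payload value is made up):

import json
from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Fixed UTC-6 offset for CDMX, as defined in the diff above."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

raw = '{"datePublished": "2017-12-20 09:15:00"}'   # made-up sample of the page's JSON payload
dt = json.loads(raw)['datePublished']
d, t = dt.split()
d = [int(x) for x in d.split("-")]   # list comprehension: indexable on Python 2 and 3
t = [int(x) for x in t.split(":")]
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=UTC()).isoformat("T")
print(news_date)                     # -> 2017-12-20T09:15:00-06:00
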