Commit 15cbb498 authored by Renán Sosa Guillen

crawlers

parent 687a5463
 # -*- coding: utf-8 -*-
-import scrapy, re
-from diarioCoLatino.items import NoticiasItem
 """
-MEDIO:
+MEDIA:
 Diario Co Latino, El Salvador
-USO:
-scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
+USAGE:
+## Get all the news from a specific date. ##
+---------------------------------------------------------------------------------------------
+$ cd diarioCoLatino/
+$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
 """
+import scrapy, re
+from diarioCoLatino.items import NoticiasItem

 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
     return TAG_RE.sub('', text)

@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
 EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
     name = "noticias"

     def start_requests(self):
         year = getattr(self, "year", None)
         month = getattr(self, "month", None)

@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
         yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider):
             yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)

     def parse_page(self, response):
-        for link in response.xpath('//div[@class="post-listing"]/article/h2/a/@href').extract():
+        for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
-        "La fecha obtenida ya incluye formato y zona horaria"
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
-        item['topic'] = None
+        # The date obtained already includes format and time zone
+        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
+        news_topic = None

         for p in response.xpath('//div[@class="entry"]/p').extract():
             text += remove_tags(p) + "\n"

@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider):
         text = "\n" + text

         """ Get author """
+        news_author = None
         res = AUTH_RE.match(text)
         if res:
             m = res.group(0)
-            item['author'] = m[m.find('Por')+len('Por'):].strip()
+            news_author = m[m.find('Por')+len('Por'):].strip()
             text = text[text.find(m) + len(m):].strip()
             text = "\n" + text

         """ Remove twitter """
+        news_twitter = None
         res = TW_RE.search(text)
         if res:
             m = res.group(0)
-            item['twitter'] = m.strip()
+            news_twitter = m.strip()
             text = text[text.find(m) + len(m):].strip()
             text = "\n" + text

         """ Get location """
+        news_loc = None
         res = LOC_RE.match(text)
         if res:
             m = res.group(0)
             if m[m.find('/') + 1:].strip().lower() != 'dpa':
-                item['location'] = m[:m.find('/')].strip()
+                news_loc = m[:m.find('/')].strip()
                 text = text[text.find(m) + len(m):].strip()
                 text = "\n" + text
             else:
@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider):
                 text = "\n" + text

         """ Remove email """
+        news_email = None
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
             # text = text[text.find(m) + len(m):].strip()
             text = text.replace(m, '').strip()
             text = "\n" + text

@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider):
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
             # text = text[text.find(m) + len(m):].strip()
             text = text.replace(m, '').strip()
             text = "\n" + text

@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider):
             text = "\n" + text

         text = text.replace("\nCo Latino\n", '').strip()
-        item['text'] = text.strip()
+
+        ## News item info ##
+        item['date'] = news_date
+        item['title'] = news_title
+        item['topic'] = news_topic
+        item['author'] = news_author
+        item['twitter'] = news_twitter
+        item['location'] = news_loc
+        item['email'] = news_email
+        item['text'] = text.strip()
         item['url'] = response.url

         yield item
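
For reference, a minimal standalone sketch (not part of the commit) of the dateline and e-mail cleanup that parse_item performs above. It uses only the two regexes visible in this diff, LOC_RE and EM_RE; the AUTH_RE and TW_RE steps are left out because those patterns are elided from this view, and the sample text and the address redaccion@example.com are invented for illustration.

# Sketch: stripping a wire-service dateline and an e-mail line from the
# article body, mirroring the flow of parse_item above.
import re

LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

text = "\nSAN SALVADOR/PL\nBody of the note.\nCorreo electronico: redaccion@example.com\nMore body text.\n"

res = LOC_RE.match(text)            # the dateline only matches at the start
if res:
    m = res.group(0)
    if m[m.find('/') + 1:].strip().lower() != 'dpa':
        news_loc = m[:m.find('/')].strip()    # -> 'SAN SALVADOR'
        text = "\n" + text[text.find(m) + len(m):].strip()

res = EM_RE.search(text)            # the e-mail line can appear anywhere
if res:
    m = res.group(0)
    news_email = m.strip()          # -> 'Correo electronico: redaccion@example.com'
    # replace() also removes the newlines around the match, as in the spider
    text = "\n" + text.replace(m, '').strip()

The second file in this commit, the elFinanciero spider, follows the same pattern:
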
 # -*- coding: utf-8 -*-
+"""
+MEDIA:
+El Financiero, CDMX
+USAGE:
+## Get the news from RSS. ##
+---------------------------------------------------------------------------------------------
+$ cd elFinanciero/
+$ scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""
 import scrapy, re, json
 from elFinanciero.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo

-"""
-MEDIO:
-El Financiero, CDMX
-USO:
-scrapy crawl noticias --nolog -s filename=2017-12-20.json
-"""
 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):

@@ -16,65 +22,73 @@ def remove_tags(text):

 class UTC(tzinfo):
-    """clase para el 'time zone' (zona horaria)"""
+    """
+    Class for Time Zone
+    """
     def utcoffset(self, dt):
-        # zona horaria para hidalgo (centro de mexico): utc-6
+        ## Time zone for CDMX: UTC-6 ##
         return timedelta(hours=-6)

     def tzname(self, dt):
-        # nombre de la zona horaria
+        ## Time zone name ##
         return 'UTC-6'

 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
     name = "noticias"

     def start_requests(self):
         self.tz = UTC()
-        # self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
-        #                     'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
-        #                     'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
         self.baseURL = "http://www.elfinanciero.com.mx/rss"
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
         for link in response.xpath('//link/text()').extract()[1:]:
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''
-        res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
-        if res is not None :
+        res = response.xpath('//script[@data-schema="NewsArticle"]').extract_first()
+        res = remove_tags(res)
         resDict = json.loads(res)
         dt = resDict['datePublished']
         d,t = dt.split()
         d = map(int, d.split("-"))
         t = map(int, t.split(":"))
-        item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
-        item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
+        news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
+        title = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()

         topic = response.xpath('//div[@class="section-line"]').extract_first()
         if topic is not None:
-            item['topic'] = remove_tags(topic)
-        else:
-            item['topic'] = None
+            topic = remove_tags(topic)

         author = response.xpath('//div[@class="note-author"]/a').extract_first()
         if author is not None:
-            item['author'] = remove_tags(author)
+            author = remove_tags(author)

         for p in response.css('div.content').css('p').extract():
             text += remove_tags(p) + '\n'
-        item['text'] = text.strip()
+
+        ## News item info ##
+        item['date'] = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['author'] = author
+        item['text'] = text.strip()
         item['url'] = response.url
-        # print item['title']

         yield item
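
And a minimal standalone sketch (not part of the commit) of the elFinanciero date handling above: the datePublished value pulled from the page's embedded JSON (the JSON string here is an invented sample) is split into date and time parts and localized with the fixed UTC-6 tzinfo class. The list comprehensions stand in for the spider's bare map() calls so the sketch also runs on Python 3, where map() returns an iterator that cannot be indexed.

# Sketch: turning a JSON 'datePublished' string into an ISO-8601 timestamp
# with the spider's fixed UTC-6 (CDMX) time zone.
import json
from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Fixed UTC-6 offset, as in the spider above."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

res = '{"datePublished": "2017-12-20 10:45:00"}'   # invented sample payload
d, t = json.loads(res)['datePublished'].split()
d = [int(x) for x in d.split("-")]
t = [int(x) for x in t.split(":")]
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=UTC()).isoformat("T")
# news_date == '2017-12-20T10:45:00-06:00'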