Commit 04edd575 authored by Renán Sosa Guillen

crawlers

parent 2726f0b3
# -*- coding: utf-8 -*-
"""
MEDIA:
El Heraldo, Honduras
USAGE:
$ cd heraldoHn/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. Afterwards, the parse_date_files.py script must be used
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent back to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=9
"""
import scrapy, re, json
from datetime import datetime, date
from heraldoHn.items import NoticiasItem
"""
MEDIA:
El Heraldo, Honduras
USAGE:
// To get all the news from the most recent to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// To get all the news from the most recent back to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=9
-------------------------------------------------------------------------------------------------
Afterwards it is necessary to use the parse_date_files.py script so that the news contained
in noticias.json is split into files by date.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
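# Quick illustration of remove_tags:
#   remove_tags('<p>Tegucigalpa, <b>Honduras</b>.</p>')  ->  'Tegucigalpa, Honduras.'
# TAG_RE only strips the markup; the enclosed text is kept untouched.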
LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')
# LOC = re.compile(r'[A-Z].*?, ?[A-Z].+?\.')
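# LOC is meant to capture the dateline that articles open with. For an
# illustrative string: LOC.match(u'TEGUCIGALPA, Honduras.') matches, and
# group(0) == u'TEGUCIGALPA, Honduras.'. In parse_item the text before the
# first non-breaking space (u'\u00a0') is checked against LOC for the
# location, and the remainder is stored as the article body.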
class ImportantFlowData(scrapy.Item):
"""
Helper item carrying crawl-flow state between requests
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
current_page = scrapy.Field()
section_url = scrapy.Field()
return_url = scrapy.Field()
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
LAST_LINK = scrapy.Field()
page = scrapy.Field()
section_url = scrapy.Field()
url = scrapy.Field()
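# Both item classes above are used only as carriers of crawl-flow state: a
# callback fills one in, attaches it to the outgoing request via
# request.meta['item'], and the next callback reads it back. A minimal sketch
# of the pattern (field names mirror the code that follows; some_section_url
# is a placeholder):
#
#   flow_info = ImportantFlowData()
#   flow_info['current_page'] = 1
#   request = scrapy.Request(url=some_section_url, callback=self.parse)
#   request.meta['item'] = flow_info
#   ...
#   # inside the callback:
#   flow_info = response.meta['item']
#   page = flow_info['current_page']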
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
@@ -40,41 +52,41 @@ class QuotesSpider(scrapy.Spider):
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
baseURL = "http://www.elheraldo.hn/"
sectionList = ["tegucigalpa", "economia", "mundo", "revistas/crimenes",
"pais", "sucesos", "deportes", "entretenimiento"]
# sectionList = ["tegucigalpa"]
if self.stopDate is None:
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
for s in sectionList:
info = ImportantData()
info['page'] = 1
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = info
flow_info = ImportantFlowData()
flow_info['current_page'] = 1
flow_info['to_next_page'] = False
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
self.stopDate = None
for s in sectionList:
info = ImportantData()
info['page'] = 1
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = info
flow_info = ImportantFlowData()
flow_info['current_page'] = 1
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = flow_info
yield request
def parse(self, response):
print response.url
# print response.url
searchData = response.meta['item']
CONTINUE_SEARCHING = True
to_next_page = True
linkSet = set()
if searchData['page'] == 1:
if searchData['current_page'] == 1:
searchData['section_url'] = response.url
linkSet = linkSet.union(set(response.xpath('//article[@id="destacada"]/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract()))
@@ -82,36 +94,42 @@ class QuotesSpider(scrapy.Spider):
linkSet = linkSet.union(set(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
linkSet.remove(searchData['section_url'])
try:
linkSet.remove(searchData['section_url'])
except (KeyError, ValueError), e:
pass
else:
linkSet = linkSet.union(set(response.xpath('//div[@class="contenedor"]/article/a/@href').extract()))
linkSet = linkSet.union(set(response.xpath('//article[@class="nobordes"]/div/a/@href').extract()))
try:
linkSet.remove(searchData['section_url'])
except KeyError:
except (KeyError, ValueError), e:
pass
if len(linkSet) <= 0:
CONTINUE_SEARCHING = False
to_next_page = False
for link in linkSet:
yield scrapy.Request(url=link, callback=self.parse_item)
if CONTINUE_SEARCHING:
searchData['page'] += 1
page = searchData['page']
if to_next_page:
searchData['current_page'] += 1
page = searchData['current_page']
url = searchData['section_url']
request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse)
request.meta['item'] = searchData
yield request
def parse_with_stop_date(self, response):
searchData = response.meta['item']
CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
to_next_page = searchData['to_next_page']
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
if not to_next_page:
if searchData['current_page'] == 1:
searchData['section_url'] = response.url
linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
@@ -119,14 +137,17 @@ class QuotesSpider(scrapy.Spider):
linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
linkList.remove(searchData['section_url'])
try:
linkList.remove(searchData['section_url'])
except (KeyError, ValueError), e:
pass
else:
linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
try:
linkList.remove(searchData['section_url'])
except KeyError:
except (KeyError, ValueError), e:
pass
newsList = []
@@ -135,91 +156,107 @@ class QuotesSpider(scrapy.Spider):
newsList.append(link)
for link in newsList:
info = ImportantData()
info['url'] = response.url
info['page'] = searchData['page']
info['section_url'] = searchData['section_url']
if link == linkList[-1]: info['LAST_LINK'] = True
else: info['LAST_LINK'] = False
flow_info = ImportantFlowData()
flow_info['return_url'] = response.url
flow_info['current_page'] = searchData['current_page']
flow_info['section_url'] = searchData['section_url']
if link == linkList[-1]: flow_info['is_last_link'] = True
else: flow_info['is_last_link'] = False
reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
reqst.meta['item'] = info
reqst.meta['item'] = flow_info
yield reqst
else:
searchData['CONTINUE_SEARCHING'] = False
searchData['page'] += 1
page = searchData['page']
searchData['to_next_page'] = False
searchData['current_page'] += 1
page = searchData['current_page']
url = searchData['section_url']
request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
request.meta['item'] = searchData
yield request
def parse_item(self, response):
item = NoticiasItem()
d = response.xpath('//time/text()').extract_first()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
news_date = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
title = newsData['headline']
try:
topic = newsData['articleSection']
except:
topic = None
item['topic'] = topic
# news_loc = None
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0') + 1:]
item['text'] = text
item['url'] = response.url
# if text.find(u'\u00a0') >= 0:
# loc = text[:text.find(u'\u00a0')] + "."
# m = LOC.match(loc)
# if m:
# news_loc = m.group(0)
# text = text[text.find(u'\u00a0') + 1:]
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
# item['location'] = news_loc
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
d = response.xpath('//time/text()').extract_first()
dt = datetime.strptime(d, '%d.%m.%Y').date()
if dt >= self.stopDate:
info = response.meta['item']
flow_info = response.meta['item']
item = NoticiasItem()
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
newsData = json.loads(res)
item['date'] = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
item['title'] = newsData['headline']
news_date = datetime.strptime(d, '%d.%m.%Y').isoformat("T")
title = newsData['headline']
try:
topic = newsData['articleSection']
except:
topic = None
item['topic'] = topic
# news_loc = None
text = newsData['articleBody']
if text.find(u'\u00a0') >= 0:
loc = text[:text.find(u'\u00a0')] + "."
m = LOC.match(loc)
if m:
item['location'] = m.group(0)
text = text[text.find(u'\u00a0')+1:]
# if text.find(u'\u00a0') >= 0:
# loc = text[:text.find(u'\u00a0')] + "."
# m = LOC.match(loc)
# if m:
# news_loc = m.group(0)
# text = text[text.find(u'\u00a0')+1:]
item['text'] = text
item['url'] = response.url
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
# item['location'] = news_loc
item['text'] = text.strip()
item['url'] = response.url
yield item
if info['LAST_LINK']:
info['CONTINUE_SEARCHING'] = True
request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = info
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunahnSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
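# Because open_spider()/close_spider() write the surrounding brackets and
# process_item() separates records with ",\n", the resulting file is one valid
# JSON array and can be loaded back directly. Illustrative check (the file
# name is whatever was passed with -s filename=...):
#
#   import json
#   with open("noticias.json") as f:
#       records = json.load(f)
#   print(len(records))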
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaHn'
SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
===============================================================================
THIS VERSION OF La Tribuna Honduras IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
CHANGED ITS ACCESS TO ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia/foraneos FOLDER.
===============================================================================
"""
import scrapy, re, json
from datetime import date
from tribunaHn.items import NoticiasItem
"""
MEDIA:
La Tribuna, Honduras
USAGE:
// To get all the news from the most recent to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// To get all the news from the most recent back to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=28
-------------------------------------------------------------------------------------------------
Afterwards it is necessary to use the parse_date_files.py script so that the news contained
in noticias.json is split into files by date.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
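# DAT_RE pulls the publication date out of article URLs, which on this site
# embed it as /YYYY/MM/DD/. Illustrative example (made-up URL):
#
#   m = DAT_RE.search("http://www.latribuna.hn/2018/03/09/alguna-noticia/")
#   m.group(0)                       # -> '2018/03/09'
#   map(int, m.group(0).split("/"))  # -> [2018, 3, 9]  (Python 2 list, as used below)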
class ImportantData(scrapy.Item):
section = scrapy.Field()
page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
if self.year is not None and self.month is not None and self.day is not None:
self.stopDate = date(int(self.year), int(self.month), int(self.day))
else:
self.stopDate = None
baseURL = "http://www.latribuna.hn/"
sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
"ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
"pecadillos-idiomaticos", "pildoritas", "columnistas", "editorial", "tribuna-del-pueblo",
"anales-historicos", "cine", "dejando-huellas", "dia-7", "dominicales", "done-un-aula",
"especiales-lt", "la-cobra-pregunta", "la-tribuna-agropecuaria", "la-tribuna-cultural",
"nuestro-orgullo", "turismo"]
# sectionList = ["noticias"]
for s in sectionList:
yield scrapy.Request(url=baseURL + s, callback=self.parse)
def parse(self, response):
CONTINUE_SEARCHING = True
linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
if self.stopDate is None:
for link in linkList:
yield scrapy.Request(url=link, callback=self.parse_item)
else:
for link in linkList:
res = DAT_RE.search(link)
if res:
dat = map(int, res.group(0).split("/"))
newsDate = date(dat[0], dat[1], dat[2])
if newsDate >= self.stopDate:
yield scrapy.Request(url=link, callback=self.parse_item)
else:
CONTINUE_SEARCHING = False
break
if CONTINUE_SEARCHING:
nextPage = response.xpath('//span[@class="next"]/a/@href').extract_first()
if nextPage is not None:
yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.xpath('//header/h1[@itemprop="name"]').extract_first())
try:
topic = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]
except:
topic = None
item['topic'] = topic
for p in response.css('div.article-post-content').css('p').extract():
text += remove_tags(p) + "\n"
item['text'] = text.strip()
item['url'] = response.url
yield item
# -*- coding: utf-8 -*-
"""
===========================================================================
THIS VERSION OF Expreso Chiapas IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
NO LONGER ALLOWS ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_hacia_atras FOLDER.
===========================================================================
"""
"""
MEDIA:
Expreso Chiapas, Chiapas
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd expresoChiapas/
$ scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
"""
import scrapy, re, json
from expresoChiapas.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
MEDIA:
Expreso Chiapas
USAGE:
scrapy crawl noticias --nolog -s filename=2018-01-30.json -a year=2018 -a month=1 -a day=30
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
@@ -16,21 +33,28 @@ def remove_tags(text):
CLEAN_RE = re.compile(r'\A.*?\sl\s')
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
# time zone for Hidalgo (central Mexico): UTC-6
## Time zone for Chiapas: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
# time zone name
## Time zone name ##
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, "year", None)
@@ -42,6 +66,7 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//div[@class="mag-box-container"]/ul/li/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
@@ -51,12 +76,13 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
try:
## this date includes time zone ##
res = response.xpath('//article[@id="the-post"]/script[@type="application/ld+json"]').extract_first()
if res is not None:
res = remove_tags(res)
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunahnSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class TribunahnDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaHn'
SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
La Tribuna, Honduras
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd tribunaHn/
$ scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=28
"""
import scrapy, re
from tribunaHn.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for Honduras: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
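# The UTC class above only fixes the offset so that the ISO timestamp carries
# the -06:00 suffix. For example, the value built in start_requests() for
# year=2018, month=2, day=28 would be:
#
#   datetime(2018, 2, 28, tzinfo=UTC()).isoformat('T')
#   # -> '2018-02-28T00:00:00-06:00'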
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.news_date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
baseURL = 'http://www.latribuna.hn/' + year + '/' + month + '/' + day
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//div[@id="main"]').css('h3 > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('span.next > a::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
text = ''
item = NoticiasItem()
title = response.css('article.article-post').css('h1').extract_first()
if title is not None: title = remove_tags(title)
topic = None
topic_list = response.css('aside.tags').css('li > a').extract()
if len(topic_list) > 0:
topic = remove_tags(topic_list[0])
for p in response.css('div.article-post-content').css('p').extract():
text += remove_tags(p) + '\n'
## News item info ##
item['date'] = self.news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
# -*- coding: utf-8 -*-
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
"""
=======================================================================
THIS VERSION OF La Jornada IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
NO LONGER ALLOWS ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_rss FOLDER.
=======================================================================
"""
"""
MEDIA:
@@ -11,6 +18,12 @@ USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re
from laJornada.items import NoticiasItem
from datetime import date, datetime, timedelta, tzinfo, time
from collections import OrderedDict
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
# -*- coding: utf-8 -*-
import scrapy, re
from laJornadaOte.items import NoticiasItem
"""
MEDIA:
La Jornada de Oriente, Puebla
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
MEDIA:
La Jornada de Oriente, Puebla
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd laJornadaOte/
$ scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
import scrapy, re
from laJornadaOte.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
@@ -27,24 +36,40 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//div[@class="mas_noticias"]/ul[@class="sollet"]/li/a/@href').extract():
for link in response.css('div.content').css('h3.entry-title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('ul.page-numbers').css('a.next::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['title'] = response.xpath('//h1[@itemprop="headline"]/text()').extract_first()
## the news date already includes the time zone
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = response.xpath('//span[@itemprop="genre"]/text()').extract_first()
for paragraph in response.xpath('//span[@itemprop="articleBody"]').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
## news date already includes time zone ##
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.css('h2.entry-title').extract_first()
if title is not None : title = remove_tags(title)
topic = response.xpath('//a[@rel="category tag"]').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.entry-content > p').extract():
text += remove_tags(p) + '\n'
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
# print item['title']
yield item
# -*- coding: utf-8 -*-
"""
MEDIA:
Lector MX, Yucatán
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd lectorMX/
$ scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
"""
import scrapy, re
from lectorMX.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
"""
MEDIA:
Lector MX, Yucatán
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-30.json -a year=2017 -a month=3 -a day=30
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
@@ -16,18 +22,23 @@ def remove_tags(text):
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
"""
Class for Time Zone
"""
def utcoffset(self, dt):
# time zone for Yucatán (central Mexico): UTC-6
## Time zone for Yucatán: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
# time zone name
## Time zone name ##
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
@@ -35,39 +46,45 @@ class QuotesSpider(scrapy.Spider):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
self.news_date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
baseURL = 'http://lectormx.com/' + year + '/' + month + '/' + day
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
def parse(self, response):
for link in response.css('div.paginated_content').css('h2.entry-title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.archive-pagination').css('a.next::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_page(self, response):
for link in response.xpath('//h2[@class="title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
text = ''
item = NoticiasItem()
item['date'] = self.date
item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
for paragraph in response.css('div.post-single-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
title = response.css('h1.entry-title').extract_first()
if title is not None: title = remove_tags(title)
topic = response.xpath('//a[@rel="tag"]').extract_first()
if topic is not None: topic = remove_tags(topic)
for p in response.css('div.entry-content > p').extract():
text += remove_tags(p) + '\n'
## News item info ##
item['date'] = self.news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
# -*- coding: utf-8 -*-
import scrapy, re
from periodicoCorreo.items import NoticiasItem
"""
MEDIA:
Periódico Correo, Guanajuato
USAGE:
scrapy crawl noticias --nolog -s filename=2018-01-29.json -a year=2018 -a month=1 -a day=29
MEDIA:
Periódico Correo, Guanajuato
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd periodicoCorreo/
$ scrapy crawl noticias --nolog -s filename=2018-01-29.json -a year=2018 -a month=1 -a day=29
"""
import scrapy, re
from periodicoCorreo.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
G_RE = re.compile(r' ?- ?')
EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
""" Regular expression for parsing some data """
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
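# The expressions above were used before this commit to scrub contact lines,
# Twitter handles and shortcode tags from the article body; they are kept here
# only as commented-out reference. Illustrative use of the e-mail rule, with a
# made-up address:
#
#   EM_RE.sub('', u'Comentarios: buzon@ejemplo.com y el resto de la nota')
#   # -> u'y el resto de la nota'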
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
@@ -34,54 +46,57 @@ class QuotesSpider(scrapy.Spider):
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
lastPage = response.xpath('//*[@class="page-nav td-pb-padding-side"]/a[@class="last"]/text()').extract_first()
if lastPage is not None and lastPage != '':
lastPage = int(lastPage)
for page in range(1, lastPage):
yield scrapy.Request(url=response.url + "page/" + str(page + 1), callback=self.parse_page)
def parse(self, response):
for link in response.css('div.main-content').css('h2.card-title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.archive-navigation').css('a.next::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_page(self, response):
for link in response.css('div.td-ss-main-content').css('div.item-details').xpath('./h3/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//time[@class="entry-date updated td-module-date"]/@datetime').extract_first()
item['title'] = remove_tags(response.xpath('//header/h1').extract_first()).strip()
## news date already includes time zone ##
news_date = response.css('div.entry-meta').css('time.entry-date::attr(datetime)').extract_first()
title = response.css('h1.entry-title').extract_first()
if title is not None: title = remove_tags(title)
try:
topic = response.xpath('//ul[@class="td-category"]/li/a/text()').extract()[-2]
except:
topic = response.xpath('//ul[@class="td-category"]/li/a/text()').extract_first()
item['topic'] = topic
topic = response.css('ul.post-categories').css('a').extract_first()
if topic is not None: topic = remove_tags(topic)
for p in response.xpath('//div[@class="td-post-content td-pb-padding-side"]/p').extract():
entry_content = response.css('div.entry-content > article > p').extract()
if len(entry_content) <= 0: entry_content = response.css('div.pane-content').css('p').extract()
if len(entry_content) <= 0: entry_content = response.css('div.post-content').css('p').extract()
for p in entry_content:
text += remove_tags(p) + "\n"
result = LOC_RE.search(text)
if result:
m = result.group(0)
location = G_RE.sub('', m).strip()
if len(location) <= 35:
item['location'] = location
text = text[text.find(m)+len(m):]
text = EM_RE.sub('', text)
text = TW_RE.sub('', text)
text = TW2_RE.sub('', text)
text = TAG2_RE.sub("\n", text)
text = TAG3_RE.sub('', text)
item['text'] = text.strip()
item['url'] = response.url
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
## deprecated section ##
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
# location = G_RE.sub('', m).strip()
# if len(location) <= 35:
# news_loc = location
# text = text[text.find(m)+len(m):]
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text)
# text = TAG3_RE.sub('', text)
# -*- coding: utf-8 -*-
import scrapy, re
from periodicoVictoria.items import NoticiasItem
"""
MEDIA:
Periódico Victoria, Durango
USAGE:
scrapy crawl noticias --nolog -s filename=2018-01-28.json -a year=2018 -a month=1 -a day=28
MEDIA:
Periódico Victoria, Durango
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd periodicoVictoria/
$ scrapy crawl noticias --nolog -s filename=2018-01-28.json -a year=2018 -a month=1 -a day=28
"""
import scrapy, re
from periodicoVictoria.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
LOC_RE = re.compile(r'\A.+?,.+?(\d{1,2}[\s-][a-zA-Z]+[\s-]\d{4})?\s?\.\s?-\s?', re.S)
G_RE = re.compile(r'\s?-\s?')
GT_RE = re.compile(r'&gt;\s?[\w.%+-]+.+')
EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
TW_RE = re.compile(r'Twitter:\s+[\w.%+-]+.')
TW2_RE = re.compile(r'(Twitter:\s+)?(@[\w.%+-]+.)?')
FB_RE = re.compile(r'(Facebook|Blog|Por|Informes|Texto y (F|f)otos):\s+[\w.%+-]+.+')
URL_RE = re.compile(r'\(?(https?:\/\/)?([w{3}.])?[\w%+-]+(\.[a-zA-Z]{2,6}){1,2}[/\w.#$%&+-]*\)?.')
""" Regular expression for parsing some data """
# LOC_RE = re.compile(r'\A.+?,.+?(\d{1,2}[\s-][a-zA-Z]+[\s-]\d{4})?\s?\.\s?-\s?', re.S)
# G_RE = re.compile(r'\s?-\s?')
# GT_RE = re.compile(r'&gt;\s?[\w.%+-]+.+')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'Twitter:\s+[\w.%+-]+.')
# TW2_RE = re.compile(r'(Twitter:\s+)?(@[\w.%+-]+.)?')
# FB_RE = re.compile(r'(Facebook|Blog|Por|Informes|Texto y (F|f)otos):\s+[\w.%+-]+.+')
# URL_RE = re.compile(r'\(?(https?:\/\/)?([w{3}.])?[\w%+-]+(\.[a-zA-Z]{2,6}){1,2}[/\w.#$%&+-]*\)?.')
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "http://periodicovictoria.mx/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
baseURL = "http://periodicovictoria.mx/" + year + "/" + month.zfill(2) + "/" + day.zfill(2)
yield scrapy.Request(url=baseURL, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.xpath('//ul[@class="pagination"]/li/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/') + 1:])
for page in range(1, pages):
yield scrapy.Request(url=response.url + "/page/" + str(page + 1), callback=self.parse_page)
def parse(self, response):
for link in response.css('main.site-main').css('h3.entry-title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
next_page = response.css('div.pagination').css('li.go-next > a::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_page(self, response):
for link in response.xpath('//div[@class="blog"]').css('div.media-body').xpath('./div/h4/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.xpath('//div[@class="article-content"]/header/h1').extract_first()).strip()
## news date already includes time zone ##
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
title = response.css('div.entry-wrapper').css('h1.entry-title').extract_first()
if title is not None: title = remove_tags(title)
try:
topic = response.xpath('//ul[@class="breadcrumb"]/li/a/text()').extract()[-2]
except:
topic = response.xpath('//ul[@class="breadcrumb"]/li/a/text()').extract_first()
item['topic'] = topic
topic = response.xpath('//a[@rel="category tag"]').extract_first()
if topic is not None: topic = remove_tags(topic)
author = response.css('aside.author-bio').xpath('./div/h3/a').extract_first()
if author is not None and author != '': item['author'] = remove_tags(author)
for p in response.xpath('//div[@class="post-entry clearfix"]/p').extract():
p = remove_tags(p)
p = p.lstrip().replace(u'\u2013', "-")
p = p.lstrip().replace(u'\u2014', "-")
p = p.lstrip().replace(u'\u00a0', '')
result = LOC_RE.match(p)
if result:
location = G_RE.sub('', result.group(0)).strip()
if location.find("Corresponsal") > -1:
location = location[location.find("Corresponsal") + len("Corresponsal"):].strip()
if len(location) <= 30:
item['location'] = location
p = p.replace(result.group(0), '')
text = ''
text += p + "\n"
text = GT_RE.sub('', text)
text = EM_RE.sub('', text)
text = TW_RE.sub('', text)
text = TW2_RE.sub('', text)
text = FB_RE.sub('', text)
text = URL_RE.sub('', text)
if text.find("Corresponsal") > -1:
text = text[text.find("Corresponsal") + len("Corresponsal"):]
item['text'] = text.strip()
item['url'] = response.url
author = response.xpath('//a[@rel="author"]').extract_first()
if author is not None: author = remove_tags(author)
for p in response.css('div.entry-content').css('p').extract():
p.replace("<br>", "\n")
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['author'] = author
item['text'] = text.strip()
item['url'] = response.url
yield item
## deprecated section ##
# for p in response.xpath('//div[@class="post-entry clearfix"]/p').extract():
# p = remove_tags(p)
# p = p.lstrip().replace(u'\u2013', "-")
# p = p.lstrip().replace(u'\u2014', "-")
# p = p.lstrip().replace(u'\u00a0', '')
# result = LOC_RE.match(p)
# if result:
# location = G_RE.sub('', result.group(0)).strip()
# if location.find("Corresponsal") > -1:
# location = location[location.find("Corresponsal") + len("Corresponsal"):].strip()
# if len(location) <= 30:
# item['location'] = location
# p = p.replace(result.group(0), '')
# text = ''
# text += p + "\n"
# text = GT_RE.sub('', text)
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = FB_RE.sub('', text)
# text = URL_RE.sub('', text)
# if text.find("Corresponsal") > -1:
# text = text[text.find("Corresponsal") + len("Corresponsal"):]
# -*- coding: utf-8 -*-
import scrapy, re
from tribunaCabos.items import NoticiasItem
"""
MEDIA:
Tribuna de los Cabos, Baja California Sur
USAGE:
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=5
MEDIA:
Tribuna de los Cabos, Baja California Sur
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd tribunaCabos/
$ scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=5
"""
import scrapy, re
from tribunaCabos.items import NoticiasItem
from datetime import datetime, timedelta, tzinfo
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
""" Regular expression for parsing location """
# LOC_RE = re.compile(ur'.+,\s+\d?\d\s+[a-zA-Z]+\.?\s\([^\)]+\)\s?\.-?\u2013?\s?', re.UNICODE)
LOC_RE = re.compile(ur'.+?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]\s?', re.UNICODE)
DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]\s?', re.UNICODE)
DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
""" Regular expression for parsing some data """
# # LOC_RE = re.compile(ur'.+,\s+\d?\d\s+[a-zA-Z]+\.?\s\([^\)]+\)\s?\.-?\u2013?\s?', re.UNICODE)
# LOC_RE = re.compile(ur'.+?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]\s?', re.UNICODE)
# DAT_RE = re.compile(ur',?\s?(\d?\d[\s-][a-zA-Z]+)?\s?(\([^\)]+\))?\s?\.[\u2013-]\s?', re.UNICODE)
# DAT2_RE = re.compile(r',?\sa\s\d{1,2}\sde\s[a-zA-Z]+\sde\s\d{4}')
class UTC(tzinfo):
"""
Class for Time Zone
"""
def utcoffset(self, dt):
## Time zone for Baja California Sur: UTC-6 ##
return timedelta(hours=-6)
def tzname(self, dt):
## Time zone name ##
return 'UTC-6'
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
tz = UTC()
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.baseURL = "http://www.tribunadeloscabos.com.mx/" + year + "/" + month + "/" + day
self.news_date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
baseURL = "http://www.tribunadeloscabos.com.mx/" + year + "/" + month + "/" + day
yield scrapy.Request(url=baseURL, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
for link in response.xpath('//*[@id="content"]').css('h2').css('a::attr(href)').extract():
for link in response.css('div.blog-posts').css('h2.entry-title > a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
previousPage = response.xpath('//li[@class="previous"]/a/@href').extract_first()
yield scrapy.Request(url=previousPage, callback=self.parse)
page_list = response.css('div.pagination').css('li > a::attr(href)').extract()
if len(page_list) > 0:
next_page = page_list[-1]
current_page = response.css('div.pagination').css('li.active > a::attr(href)').extract_first()
if next_page != current_page:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
title = response.css('div.post-title > h1').extract_first()
if title is not None: title = remove_tags(title)
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//time[@class="entry-date published"]/@datetime').extract_first()
item['title'] = remove_tags(response.xpath('//header[@class="entry-header"]/h1').extract_first()).strip()
item['topic'] = response.xpath('//*[@class="above-entry-meta"]/span/a/text()').extract_first()
author = response.xpath('//span[@class="author vcard"]/a/text()').extract_first()
if author is not None and author != '':
item['author'] = author
topic = response.css('div.post-category > a').extract_first()
if topic is not None: topic = remove_tags(topic)
bodyText = response.xpath('//*[@class="entry-content clearfix"]/p').extract()
for i in range(0, len(bodyText)):
p = remove_tags(bodyText[i])
if i <= 1:
p = p.lstrip()
result = LOC_RE.match(p)
if result:
loc = DAT_RE.sub('', result.group(0))
item['location'] = DAT2_RE.sub('', loc)
p = LOC_RE.sub('', p)
for p in response.css('div.post-content').css('p').extract():
text += remove_tags(p) + "\n"
text += p + "\n"
item['text'] = text
item['url'] = response.url
## News item info ##
item['date'] = self.news_date
item['title'] = title
item['topic'] = topic
item['text'] = text
item['url'] = response.url
yield item
## deprecated section ##
# author = response.xpath('//span[@class="author vcard"]/a/text()').extract_first()
# if author is not None and author != '':
# item['author'] = author
# bodyText = response.xpath('//*[@class="entry-content clearfix"]/p').extract()
# for i in range(0, len(bodyText)):
# p = remove_tags(bodyText[i])
# if i <= 1:
# p = p.lstrip()
# result = LOC_RE.match(p)
# if result:
# loc = DAT_RE.sub('', result.group(0))
# item['location'] = DAT2_RE.sub('', loc)
# p = LOC_RE.sub('', p)
# text += p + "\n"