Commit 2e1d39ff authored by Renán Sosa Guillen

merge foraneos

parents 0900a741 40e60fc6
...
@@ -30,6 +30,7 @@
 {"nombre": "Notisureste", "crawler": "descarga_por_dia/notisureste", "desde": "28-11-2011", "url": "http://www.notisureste.com"},
 {"nombre": "Periodico Correo", "crawler": "descarga_por_dia/periodicoCorreo", "desde": "08-09-2013", "url": "https://periodicocorreo.com.mx"},
 {"nombre": "Periodico Victoria", "crawler": "descarga_por_dia/periodicoVictoria", "desde": "09-10-2013", "url": "http://periodicovictoria.mx"},
+{"nombre": "La Prensa Grafica", "crawler": "descarga_hacia_atras/foraneos/prensaGrafica", "url": "https://www.laprensagrafica.com"},
 {"nombre": "Proceso", "crawler": "descarga_por_mes/proceso", "desde": "11-1976", "url": "http://www.proceso.com.mx"},
 {"nombre": "Punto Medio", "crawler": "descarga_por_dia/puntoMedio", "desde": "02-08-2015", "url": "https://www.puntomedio.mx"},
 {"nombre": "Sona 89.3", "crawler": "descarga_por_dia/sona893", "desde": "09-04-2012", "url": "http://sona893.fm"},
...
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import json
import os
import datetime

today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:  # site-list JSON passed as the first argument
    siteList = json.load(data_file)
    os.chdir(baseDir)
    for s in siteList:
        media = s['crawler'][s['crawler'].rfind("/") + 1:]
        try:
            os.makedirs(media)
        except:
            print "ok"
        os.chdir(media)
        lstYears = os.listdir(".")
        lstYears.sort()
        if len(lstYears) > 0:
            year = int(lstYears[len(lstYears) - 1])  # most recent year already stored
        else:
            year = today.date().year
        print year
        try:
            os.makedirs(str(year))
        except:
            print "ok"
        os.chdir(str(year))
        lstDays = os.listdir(".")
        lstDays = [l for l in lstDays if not l.startswith('.')]
        lstDays.sort()
        print lstDays
        filename = "noticias.json"
        if len(lstDays) > 0:
            # resume from the date of the newest per-day file already on disk
            strDate = lstDays[len(lstDays) - 1]
            print strDate
            strDate = strDate[:strDate.find(".")]
            currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
            scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
        else:
            scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
        mydir = os.getcwd()
        print mydir
        os.chdir(scrapyDir + s['crawler'])
        print media
        print scrapycommand
        os.system(scrapycommand)
        fileSize = os.stat(filename).st_size
        if fileSize <= 3:
            # a file this small can only hold an empty JSON array, so discard it
            os.system("rm " + filename)
        else:
            os.chdir(scrapyDir)
            os.system("python3 parse_date_files.py " + s['crawler'])
            os.chdir(media)
            mediaYears = os.listdir(".")
            mediaYears.sort()
            for yy in mediaYears:
                os.chdir(yy)
                try:
                    os.makedirs(baseDir + media + "/" + yy)
                except:
                    pass
                mediaDays = os.listdir(".")
                mediaDays = [l for l in mediaDays if not l.startswith('.')]
                mediaDays.sort()
                for dd in mediaDays:
                    # move the per-day files into the permanent storage tree
                    os.system("mv " + dd + " " + baseDir + media + "/" + yy)
                os.chdir("..")
                os.system("rm -R " + yy)
            os.chdir("..")
            os.system("rm -R " + media)
            os.chdir(s['crawler'])
            os.system("rm " + filename)
        os.chdir(mydir)
        os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d  # run of the crawler corresponding to each site
\ No newline at end of file
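The non-obvious step in the dispatcher above is how it decides where to resume: it takes the newest YYYY-MM-DD.json already stored for a site and feeds that date back to the spider through -a year/-a month/-a day. A minimal sketch of that derivation, using a hypothetical file name:

import datetime
last_file = "2018-03-07.json"  # hypothetical: newest file under <baseDir>/<media>/<year>/
str_date = last_file[:last_file.find(".")]
resume = datetime.datetime.strptime(str_date, "%Y-%m-%d")
print("scrapy crawl noticias --nolog -s filename=noticias.json"
      " -a year=%d -a month=%d -a day=%d" % (resume.year, resume.month, resume.day))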
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class PrensagraficaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
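Note that the filename setting this pipeline reads is not defined in settings.py; it is supplied on the command line through Scrapy's -s option, as the spider usage notes further down also show:

scrapy crawl noticias --nolog -s filename=noticias.json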
# -*- coding: utf-8 -*-
# Scrapy settings for prensaGrafica project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'prensaGrafica'
SPIDER_MODULES = ['prensaGrafica.spiders']
NEWSPIDER_MODULE = 'prensaGrafica.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'prensaGrafica (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'prensaGrafica.middlewares.PrensagraficaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'prensaGrafica.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'prensaGrafica.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re, json
from datetime import date
from prensaGrafica.items import NoticiasItem

"""
MEDIA OUTLET:
La Prensa Gráfica, El Salvador

USAGE:
// To get every news item, from the most recent down to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json

-------------------------------------------------------------------------------------------------
// To get every news item, from the most recent down to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=29

-------------------------------------------------------------------------------------------------
Afterwards, parse_date_files.py must be used so that the news stored in noticias.json
is split into one file per date.
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

DAT_RE = re.compile(r'-\d{8}-')


class ImportantData(scrapy.Item):
    section = scrapy.Field()
    page = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        if self.year is not None and self.month is not None and self.day is not None:
            self.stopDate = date(int(self.year), int(self.month), int(self.day))
        else:
            self.stopDate = None

        self.baseURL = "https://www.laprensagrafica.com"
        self.baseURI = self.baseURL + "/ajax/get_section_news.html?viewmore=%2Fajax%2Fget_section_news.html&page="
        self.sectionURI = "&size=6&section="

        sectionList = ["elsalvador", "departamento15", "internacionales", "deportes", "economia",
                       "opinion", "techlife", "farandula", "salud", "tendencias", "mujer", "turismo"]
        # sectionList = ["elsalvador"]

        for s in sectionList:
            yield scrapy.Request(url=self.baseURL + "/seccion/" + s, callback=self.parse)

    def parse(self, response):
        searchData = ImportantData()
        CONTINUE_SEARCHING = True

        section = response.url[response.url.rfind("/") + 1:]
        if section == "internacionales":
            section = "internacional"

        if self.stopDate is None:
            for link in response.xpath('//div[@class="col-main"]/article/div/h1/a/@href').extract():
                yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
        else:
            for link in response.xpath('//div[@class="col-main"]/article/div/h1/a/@href').extract():
                res = DAT_RE.search(link)
                if res:
                    dat = res.group(0).replace("-", '')
                    newsDate = date(int(dat[:4]), int(dat[4:6]), int(dat[6:]))
                    if newsDate >= self.stopDate:
                        yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
                    else:
                        CONTINUE_SEARCHING = False
                        break

        if CONTINUE_SEARCHING:
            page = 2
            url = self.baseURI + str(page) + self.sectionURI + section
            searchData['section'] = section
            searchData['page'] = page
            request = scrapy.Request(url=url, callback=self.continue_searching)
            request.meta['item'] = searchData
            yield request

    def continue_searching(self, response):
        searchData = response.meta['item']
        CONTINUE_SEARCHING = True

        linkList = response.xpath('//article/div/h1/a/@href').extract()

        if len(linkList) > 0:
            if self.stopDate is None:
                for link in linkList:
                    yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
            else:
                for link in linkList:
                    res = DAT_RE.search(link)
                    if res:
                        dat = res.group(0).replace("-", '')
                        newsDate = date(int(dat[:4]), int(dat[4:6]), int(dat[6:]))
                        if newsDate >= self.stopDate:
                            yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
                        else:
                            CONTINUE_SEARCHING = False
                            break
        else:
            CONTINUE_SEARCHING = False

        if CONTINUE_SEARCHING:
            searchData['page'] += 1
            page = searchData['page']
            section = searchData['section']
            url = self.baseURI + str(page) + self.sectionURI + section
            request = scrapy.Request(url=url, callback=self.continue_searching)
            request.meta['item'] = searchData
            yield request

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        # The date obtained here already comes formatted and with its time zone.
        newsData = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
        newsDict = json.loads(newsData)
        item['date'] = newsDict['datePublished']
        item['title'] = newsDict['headline']

        try:
            topic = response.xpath('//div[@class="news-line"]/a/text()').extract()[-1]
        except:
            topic = None
        item['topic'] = topic

        author = response.css('div.content-author').xpath('./p/meta[@itemprop="name"]/@content').extract_first()
        if author is not None:
            item['author'] = author

        for p in response.css('div.news-body').css('p').extract():
            text += remove_tags(p) + "\n"
        item['text'] = text.strip()

        item['url'] = response.url

        yield item
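The stop-date check in this spider never visits the article page: the publication date is pulled straight out of the link with DAT_RE, which matches the eight-digit block in La Prensa Gráfica article paths. A minimal sketch of that extraction, with an illustrative (not real) path:

import re
from datetime import date
DAT_RE = re.compile(r'-\d{8}-')
link = "/elsalvador/Una-nota-de-ejemplo-20180305-0123.html"  # illustrative path, not taken from the site
res = DAT_RE.search(link)
if res:
    dat = res.group(0).replace("-", "")
    print(date(int(dat[:4]), int(dat[4:6]), int(dat[6:])))  # 2018-03-05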
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = prensaGrafica.settings
[deploy]
#url = http://localhost:6800/
project = prensaGrafica
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals


class TribunahnSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict


class JsonWriterPipeline(object):

    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')

        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # print("this is my item", item)
        row = []
        try:
            row.append(("date", item['date']))
        except:
            pass
        try:
            row.append(("topic", item['topic']))
        except:
            pass
        try:
            row.append(("title", item['title']))
        except:
            pass
        try:
            row.append(("author", item['author']))
        except:
            pass
        try:
            row.append(("location", item['location']))
        except:
            pass
        try:
            row.append(("text", item['text']))
        except:
            pass
        try:
            row.append(("url", item['url']))
        except:
            pass

        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaHn'
SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re, json
from datetime import date
from tribunaHn.items import NoticiasItem

"""
MEDIA OUTLET:
La Tribuna, Honduras

USAGE:
// To get every news item, from the most recent down to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json

-------------------------------------------------------------------------------------------------
// To get every news item, from the most recent down to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=29

-------------------------------------------------------------------------------------------------
Afterwards, parse_date_files.py must be used so that the news stored in noticias.json
is split into one file per date.
"""

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)

DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')


class ImportantData(scrapy.Item):
    section = scrapy.Field()
    page = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)

        if self.year is not None and self.month is not None and self.day is not None:
            self.stopDate = date(int(self.year), int(self.month), int(self.day))
        else:
            self.stopDate = None

        self.baseURL = "http://www.latribuna.hn/"
        # self.baseURI = self.baseURL + "/ajax/get_section_news.html?viewmore=%2Fajax%2Fget_section_news.html&page="
        # self.sectionURI = "&size=6&section="

        sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
                       "ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
                       "pecadillos-idiomaticos", "pildoritas", "columnistas", "editorial", "tribuna-del-pueblo",
                       "anales-historicos", "cine", "dejando-huellas", "dia-7", "dominicales", "done-un-aula",
                       "especiales-lt", "la-cobra-pregunta", "la-tribuna-agropecuaria", "la-tribuna-cultural",
                       "nuestro-orgullo", "turismo"]
        # sectionList = ["noticias"]

        for s in sectionList:
            yield scrapy.Request(url=self.baseURL + s, callback=self.parse)

    def parse(self, response):
        # searchData = ImportantData()
        CONTINUE_SEARCHING = True

        linkList = response.xpath('//section[@class="section-67"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
        linkList.extend(response.xpath('//section[@class="section-67"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())

        if self.stopDate is None:
            for link in linkList:
                yield scrapy.Request(url=link, callback=self.parse_item)
        else:
            for link in linkList:
                res = DAT_RE.search(link)
                if res:
                    dat = map(int, res.group(0).split("/"))
                    newsDate = date(dat[0], dat[1], dat[2])
                    if newsDate >= self.stopDate:
                        yield scrapy.Request(url=link, callback=self.parse_item)
                    else:
                        CONTINUE_SEARCHING = False
                        break

        if CONTINUE_SEARCHING:
            nextPage = response.xpath('//span[@class="next"]/a/@href').extract_first()
            if nextPage is not None:
                yield scrapy.Request(url=nextPage, callback=self.parse)

    # def continue_searching(self, response):
    #     searchData = response.meta['item']
    #     CONTINUE_SEARCHING = True
    #
    #     linkList = response.xpath('//article/div/h1/a/@href').extract()
    #
    #     if len(linkList) > 0:
    #         if self.stopDate is None:
    #             for link in linkList:
    #                 yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
    #         else:
    #             for link in linkList:
    #                 res = DAT_RE.search(link)
    #                 if res:
    #                     dat = res.group(0).replace("-", '')
    #                     newsDate = date(int(dat[:4]), int(dat[4:6]), int(dat[6:]))
    #                     if newsDate >= self.stopDate:
    #                         yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
    #                     else:
    #                         CONTINUE_SEARCHING = False
    #                         break
    #     else:
    #         CONTINUE_SEARCHING = False
    #
    #     if CONTINUE_SEARCHING:
    #         searchData['page'] += 1
    #         page = searchData['page']
    #         section = searchData['section']
    #         url = self.baseURI + str(page) + self.sectionURI + section
    #         request = scrapy.Request(url=url, callback=self.continue_searching)
    #         request.meta['item'] = searchData
    #         yield request

    def parse_item(self, response):
        item = NoticiasItem()
        text = ''
        # The date obtained here already comes formatted and with its time zone.
        newsData = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
        newsDict = json.loads(newsData)
        item['date'] = newsDict['datePublished']
        item['title'] = newsDict['headline']

        try:
            topic = response.xpath('//div[@class="news-line"]/a/text()').extract()[-1]
        except:
            topic = None
        item['topic'] = topic

        author = response.css('div.content-author').xpath('./p/meta[@itemprop="name"]/@content').extract_first()
        if author is not None:
            item['author'] = author

        for p in response.css('div.news-body').css('p').extract():
            text += remove_tags(p) + "\n"
        item['text'] = text.strip()

        item['url'] = response.url

        yield item
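The same stop-date shortcut for La Tribuna relies on the year/month/day segments embedded in its article URLs; note that indexing the result of map() directly, as the spider does, only works on Python 2, where map() returns a list. A small sketch of the equivalent extraction, with an illustrative URL:

import re
from datetime import date
DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
link = "http://www.latribuna.hn/2018/03/05/una-nota-de-ejemplo/"  # illustrative URL
res = DAT_RE.search(link)
if res:
    y, m, d = [int(x) for x in res.group(0).split("/")]
    print(date(y, m, d))  # 2018-03-05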
import json, os, sys
from datetime import datetime
from collections import OrderedDict

"""
Usage:
python parse_date_files.py <crawler_name>
E.g.
python parse_date_files.py descarga_hacia_atras/laJornadaBC2
"""


def dictRowGenerator(line):
    # keep only the fields present in the item, in a fixed key order
    row = []
    try:
        row.append(("date", line['date']))
    except:
        pass
    try:
        row.append(("topic", line['topic']))
    except:
        pass
    try:
        row.append(("title", line['title']))
    except:
        pass
    try:
        row.append(("author", line['author']))
    except:
        pass
    try:
        row.append(("location", line['location']))
    except:
        pass
    try:
        row.append(("text", line['text']))
    except:
        pass
    try:
        row.append(("url", line['url']))
    except:
        pass
    return row


info = sys.argv[1]
media = info[info.rfind("/") + 1:]
download_type = info[:info.rfind("/")]

this_file_path = os.path.dirname(os.path.realpath(__file__))
json_file_path = this_file_path + "/" + download_type + "/" + media
destination_path = this_file_path + "/" + media
json_file = json.loads(open(json_file_path + "/noticias.json").read())

# group the news by publication date and write one JSON array per day
date_set = set()
for news in json_file:
    if news['date'] is not None:
        news_date = news['date'][:news['date'].rfind("T")]
        if len(news_date) > 10:
            news_date = news['date'][:news['date'].rfind(' ')]
        if not news_date in date_set:
            date_set.add(news_date)
            print(news_date)
            urlSet = set()
            try:
                export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
            except:
                os.makedirs(destination_path + "/" + news_date[:4])
                export_file = open(destination_path + "/" + news_date[:4] + "/" + news_date + ".json", 'a')
            counter = 0
            export_file.write("[")
            for line in json_file:
                if line['date'] is not None:
                    line_date = line['date'][:line['date'].rfind("T")]
                    if len(line_date) > 10:
                        line_date = line['date'][:line['date'].rfind(' ')]
                    if not line['url'] in urlSet and line_date == news_date:
                        urlSet.add(line['url'])
                        counter += 1
                        auxRow = dictRowGenerator(line)
                        row = OrderedDict(auxRow)
                        if counter == 1:
                            export_file.write(json.dumps(row))
                        elif counter > 1:
                            export_file.write(",\n" + json.dumps(row))
            export_file.write("]")
            export_file.close()
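The output of parse_date_files.py is one JSON array per calendar day, written to <media>/<year>/<date>.json next to the script and then moved into the permanent tree by the dispatcher. A minimal sketch of consuming one such per-day file (the path is hypothetical):

import json
with open("prensaGrafica/2018/2018-03-05.json") as f:  # hypothetical per-day file
    daily = json.load(f)
for item in daily:
    print(item.get("date"), item.get("title"))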