import os
newsDir = '/home/geoint/virtualHDD/m3/noticias'
# newsDir = '/home/cna_service/noticias/'
os.chdir(newsDir)
mediaLst = os.listdir('.')
mediaLst.sort()
vacios_txt = open('vacios.txt','w')
empty_count = 0
for media in mediaLst:
os.chdir(media)
yearLst = os.listdir('.')
yearLst.sort()
for year in yearLst:
os.chdir(year)
fileLst = os.listdir('.')
fileLst.sort()
for file in fileLst:
fileSize = os.stat(file).st_size
if not file.startswith('.') and fileSize <= 3:
empty_count += 1
if empty_count == 1:
vacios_txt.write(media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
else:
vacios_txt.write('\n'+media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
print(media+'/'+year+'/'+file+', '+'File size: '+str(fileSize))
os.system('rm '+file)
os.chdir('..')
os.chdir('..')
vacios_txt.write('\n'+'Total archivos vacios: '+str(empty_count)+'\n')
vacios_txt.close()
print('Total archivos vacios: '+str(empty_count))
#!/bin/bash
python /home/geoint/crawlerNoticias/crawler_script.py /home/geoint/crawlerNoticias/crawler_data.json
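# NOTE: a hypothetical sketch of the site-list JSON handed to the scripts in this repo
# (crawler_data.json here, sys.argv[1] in the Python scripts further down). The field
# names 'nombre', 'crawler', 'desde' and 'hasta' are the ones those scripts read; the
# concrete values below are made up for illustration only.
import json

example_sites = [
    {"nombre": "La Jornada", "crawler": "descarga_por_dia/laJornada",
     "desde": "01-01-2018", "hasta": "31-01-2018"}
]
with open("crawler_data.json", "w") as f:
    json.dump(example_sites, f)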
#!/bin/bash
## ------------------------------------------------------------------
## SCRIPT FOR THE AUTOMATIC DOWNLOAD OF NEWS WITH THE CRAWLER
## ------------------------------------------------------------------
site_section=( otros_sitios sitios_yucatan )
other_site_list=( diarioYaqui laJornada laJornadaAgs laJornadaBC laJornadaGro laJornadaMaya laJornadaOte laJornadaSanLuis laJornadaVer laJornadaZac )
yuc_site_list=( alChile desdeElBalcon diarioYucatan grilloPorteno laVerdadYuc lectorMX miPuntoDeVista notirivas notisureste puntoMedio sona893 yucatanALaMano yucatanAlMinuto yucatanEnCorto )
base_path=$HOME/crawler/
cd $base_path # activate the crawler's venv (virtual environment)
source bin/activate
PATH=$PATH:$HOME/crawler/bin/python:$HOME/crawler/bin/scrapy # paths where scrapy and python are located
export PATH
## COMPUTE THE NUMBER OF DAYS OF NEWS TO DOWNLOAD -----------------------------
function obtain_days() {
local last_date=$1 # parameter 1
local stop_date=$2 # parameter 2
local day_date_1=`date -d "$last_date" '+%j'` # day-of-year number of the date in "last_date"
local y1=`date -d "$last_date" '+%Y'`
local day_date_2=`date -d "$stop_date" '+%j'`
local y2=`date -d "$stop_date" '+%Y'`
if [ $y1 -eq $y2 ] # if $y1 equals $y2
then
local num_days=$(expr $day_date_2 - $day_date_1)
elif [ $y1 -lt $y2 ]
then
local days_date_1=0
for year in `seq $y1 $y2`
do
if [ $year -eq $y1 ]
then
local days_date=$(expr `date -d "$y1-12-31" '+%j'` - $day_date_1)
elif [ $year -eq $y2 ]
then
days_date=$day_date_2
else
days_date=`date -d "$year-12-31" '+%j'`
fi
days_date_1=$(expr $days_date_1 + $days_date)
done
local num_days=$(expr $days_date_1)
fi
return $num_days
}
## ----------------------------------------------------------------------------
## NEWS DOWNLOAD SEQUENCE --------------------------------------------------------------
for section in ${site_section[@]}
do
if [ $section = otros_sitios ]
then
list=${other_site_list[@]}
else
list=${yuc_site_list[@]}
fi
for site in $list
do
## FOR EACH SITE, FIND THE FILE WITH THE LAST DATE ON WHICH NEWS WAS DOWNLOADED ----------
cd crawledJsonFiles/$section/$site
max=`ls | tail -1` # get the latest directory
cd $max
json_file=`ls | tail -1` # get the latest file inside the directory
## ----------------------------------------------------------------------------------------
cd ~/crawler
last_date=`date -d "${json_file%%.*}" '+%Y-%m-%d'`
stop_date=`date -d "now" '+%Y-%m-%d'` # downloads up to one day before this date
## NOTE: To download up to one day before, the stop must be set one day later. That is why 'stop_date' is set to 'now'.
if [ $last_date != $stop_date ]
then
last_date=`date -d "$last_date +1 days" '+%Y-%m-%d'`
## THE 'obtain_days' FUNCTION COMPUTES THE NUMBER OF DAYS BETWEEN THE LAST DOWNLOAD DATE AND THE STOP DATE
obtain_days $last_date $stop_date # parameters passed to the function
num_days=$? # value returned by the 'obtain_days' function
for i in `seq $num_days -1 1`
do
y=`date -d "$stop_date - $i days" '+%Y'`
m=`date -d "$stop_date - $i days" '+%m'`
d=`date -d "$stop_date - $i days" '+%d'`
cd cawlersNoticias/$section/$site/ # path where the crawlers of each site are hosted
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
[ -s $y-$m-$d.json ] || rm $y-$m-$d.json # check whether the file contains data; if not, delete it
if [ -e $y-$m-$d.json ] # check whether the json file with the news was generated
then
destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated json files will be stored
if [ ! -d $destination ] # create the destination path if it does not exist
then
mkdir -p $destination
fi
mv -f $y-$m-$d.json $destination # move the json file to the destination path
fi
cd ~/crawler
done
fi
done
done
deactivate
## ------------------------------------------------------------------------------------------------
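# The obtain_days function above counts the days between last_date and stop_date via
# day-of-year arithmetic. A minimal Python sketch of the same computation, for
# illustration only (the production script stays in bash):
from datetime import date

def obtain_days(last_date, stop_date):
    """Days between two 'YYYY-MM-DD' strings, e.g. ('2018-03-01', '2018-03-05') -> 4."""
    y1, m1, d1 = (int(x) for x in last_date.split("-"))
    y2, m2, d2 = (int(x) for x in stop_date.split("-"))
    return (date(y2, m2, d2) - date(y1, m1, d1)).days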
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for the continuous download of news history from media of the "descarga_hacia_atras" type.
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "noticias.json"
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
print strDate
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'] + " " + filename)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
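# Sketch of the resume logic used above: the newest JSON in the year folder is named
# after its date (e.g. "2018-05-30.json"), so stripping the extension and parsing it
# gives the day the crawl should restart from. Illustration only.
import datetime

def resume_date(last_filename):
    str_date = last_filename[:last_filename.find(".")]        # "2018-05-30"
    return datetime.datetime.strptime(str_date, "%Y-%m-%d")   # -> datetime(2018, 5, 30)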
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for the continuous download of news history from media of the "descarga_por_dia" type.
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
year = desde.year
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
for y in range(year, today.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
if len(lstDays) > 0:
strDate = lstDays[len(lstDays)-1]
strDate = strDate[:strDate.find(".")]
currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
day = currentDate.timetuple().tm_yday
elif y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
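# Sketch of the loop bound computed above: for a year that is already over, the walk
# covers every day of that year (365 or 366); for the current year it stops at today's
# day-of-year. Illustration only.
import datetime

def last_day_to_visit(y, today):
    days_in_year = (datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1
    return days_in_year if today.year != y else today.timetuple().tm_yday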
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for the automatic download of news via RSS.
"""
import sys
import json
import os
import datetime
from collections import OrderedDict
def dictRowGenerator(line):
row = []
try:
row.append(("date", line['date']))
except:
pass
try:
row.append(("topic", line['topic']))
except:
pass
try:
row.append(("title", line['title']))
except:
pass
try:
row.append(("author", line['author']))
except:
pass
try:
row.append(("location", line['location']))
except:
pass
try:
row.append(("text", line['text']))
except:
pass
try:
row.append(("url", line['url']))
except:
pass
return row
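# A compact equivalent of dictRowGenerator, shown only as an illustration: it keeps the
# same field order and silently skips the fields a news item lacks.
def dict_row_generator_compact(line):
    keys = ("date", "topic", "title", "author", "location", "text", "url")
    return [(k, line[k]) for k in keys if k in line]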
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
# desde = datetime.datetime.strptime(s['desde'], '%d-%m-%Y')
desde = today
print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/") + 1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
CORRECT_YEAR = False
while not CORRECT_YEAR:
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
element = lstYears[len(lstYears) - 1]
if element[-4:] == 'json':
os.system('rm ' + element)
else:
CORRECT_YEAR = True
else:
break
if CORRECT_YEAR: year = int(element)
else: year = desde.year
for y in range(year, today.year + 1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde.date()
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays) - 1]
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# day = currentDate.timetuple().tm_yday
# elif y != desde.year:
# currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
# day = 1
for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if today.year != y else today.timetuple().tm_yday) + 1):
YESTERDAY = False
filename = currentDate.strftime('%Y-%m-%d') + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3:
os.system("rm " + filename)
else:
f1 = mydir + "/" + filename
f2 = filename
f3 = baseDir + media + "/" + filename
try:
with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
master = json.load(infile1)
slave = json.load(infile2)
urlSet = set([line['url'] for line in master])
counter = 0
infile3.write("[")
for line in master:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(",\n" + json.dumps(row))
for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate:
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
infile3.write(",\n" + json.dumps(row))
elif lineDate != currentDate and (currentDate - lineDate).days == 1:
YESTERDAY = True
infile3.write("]")
os.system("mv " + f3 + " " + mydir)
# os.system("rm " + f2)
except:
os.system("cp " + f2 + " " + mydir)
if YESTERDAY:
currentDate -= datetime.timedelta(days=1)
filenameYesterday = currentDate.strftime('%Y-%m-%d') + ".json"
f1 = mydir + '/' + filenameYesterday
f2 = filename
f3 = baseDir + media + '/' + filenameYesterday
with open(f2) as infile2, open(f3, 'a') as infile3:
try:
infile1 = open(f1)
master = json.load(infile1)
yesterdayFlag = True
except:
yesterdayFlag = False
urlSet = set()
slave = json.load(infile2)
infile3.write("[")
if yesterdayFlag:
urlSet = set([line['url'] for line in master])
counter = 0
for line in master:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(",\n" + json.dumps(row))
counter = 0
for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
if not line['url'] in urlSet and lineDate == currentDate:
counter += 1
auxRow = dictRowGenerator(line)
row = OrderedDict(auxRow)
if not yesterdayFlag and counter == 1:
infile3.write(json.dumps(row))
else:
infile3.write(",\n" + json.dumps(row))
infile3.write("]")
if yesterdayFlag: infile1.close()
os.system("mv " + f3 + " " + mydir)
os.system("rm " + f2)
os.chdir(mydir)
if YESTERDAY:
currentDate += datetime.timedelta(days=2)
else:
currentDate += datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
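# Minimal sketch of the merge done above: items already stored for the day ("master")
# are kept, and freshly scraped items ("slave") are appended only if their URL is new
# and their date is the day being written. Assumes the same item layout ('url' plus a
# 'date' string starting with 'YYYY-MM-DD').
import datetime

def merge_by_url(master, slave, current_date):
    def day_of(line):
        return datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d').date()
    url_set = set(line['url'] for line in master)
    merged = [line for line in master if day_of(line) == current_date]
    merged += [line for line in slave
               if line['url'] not in url_set and day_of(line) == current_date]
    return merged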
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for tracking the news history of media of the "descarga_hacia_atras" type.
"""
import sys
import json
import os
import datetime
today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
lstYears = os.listdir(".")
lstYears.sort()
if len(lstYears) > 0:
year = int(lstYears[len(lstYears)-1])
else:
year = today.date().year
print year
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
lstDays = os.listdir(".")
lstDays = [l for l in lstDays if not l.startswith('.')]
lstDays.sort()
print lstDays
filename = "news.json"
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# print strDate
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
#
# else:
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
os.system("python3 parse_date_files.py " + s['crawler'] + " " + filename)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
# os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
os.chdir("..")
# print today.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
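# Aside: these scripts shell out with os.system. An equivalent call through subprocess
# would look like the sketch below; this is an alternative shown for clarity, not what
# the scripts above actually use.
import subprocess

def run_spider(filename, year=None, month=None, day=None):
    cmd = ["scrapy", "crawl", "noticias", "--nolog", "-s", "filename=" + filename]
    if year is not None:
        cmd += ["-a", "year=" + str(year),
                "-a", "month=" + str(month),
                "-a", "day=" + str(day)]
    return subprocess.call(cmd)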
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Downloads a site's news between two specific dates, for media of the 'descarga_por_dia' type.
USAGE:
tracker.py data.json
"""
import sys
import json
import os
import datetime
# today = datetime.datetime.now()
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
desde = datetime.datetime.strptime(s['desde'], "%d-%m-%Y")
hasta = datetime.datetime.strptime(s['hasta'], "%d-%m-%Y")
print str(s['nombre'] +", desde:" + desde.strftime("%Y-%m-%d"))
media = s['crawler'][s['crawler'].rfind("/")+1:]
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# lstYears = os.listdir(".")
# lstYears.sort()
year = desde.year
# if len(lstYears) > 0:
# year = int(lstYears[len(lstYears)-1])
for y in range(year, hasta.year+1):
print y
try:
os.makedirs(str(y))
except:
print "ok"
os.chdir(str(y))
# print os.getcwd()
# lstDays = os.listdir(".")
# lstDays = [l for l in lstDays if not l.startswith('.')]
# lstDays.sort()
# print lstDays
day = desde.timetuple().tm_yday
print day
currentDate = desde
# if len(lstDays) > 0:
# strDate = lstDays[len(lstDays)-1]
# strDate = strDate[:strDate.find(".")]
# currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
# day = currentDate.timetuple().tm_yday
# elif y != desde.year:
if y != desde.year:
currentDate = datetime.datetime.strptime(str(y)+"-01-01", '%Y-%m-%d')
day = 1
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if hasta.year!=y else hasta.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1)
os.chdir("..")
os.chdir("..")
# print hasta.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d # run the crawler corresponding to the site
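# Sketch of the per-day walk above: one JSON file name per day between 'desde' and
# 'hasta', in the same '%Y-%m-%d.json' format the tracker moves around. Illustration only.
import datetime

def daily_filenames(desde, hasta):
    current = desde
    while current <= hasta:
        yield current.strftime('%Y-%m-%d') + ".json"
        current += datetime.timedelta(days=1)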
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Script for tracking the PROCESO media outlet.
"""
import sys
import os
baseDir = "/home/geoint/M3NAS/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/"
s = {"crawler": "descarga_por_mes/proceso"}
media = s['crawler'][s['crawler'].rfind("/")+1:]
os.chdir(baseDir)
try:
os.makedirs(media)
except:
print "ok"
os.chdir(media)
# the range goes from 1976 to 2018
for year in xrange(1976, 2019):
try:
os.makedirs(str(year))
except:
print "ok"
os.chdir(str(year))
filename = str(year) + ".json"
scrapycommand = "scrapy crawl noticias --nolog -s filename={0} -a year={1}".format(filename, str(year))
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir + s['crawler'])
print media
print scrapycommand
os.system(scrapycommand)
fileSize = os.stat(filename).st_size
if fileSize <= 3: os.system("rm " + filename)
else:
os.chdir(scrapyDir)
sys_command = "python3 parse_date_files.py {0} {1}".format(s['crawler'], filename)
os.system(sys_command)
os.chdir(media)
mediaYears = os.listdir(".")
mediaYears.sort()
for yy in mediaYears:
os.chdir(yy)
try:
os.makedirs(baseDir + media + "/" + yy)
except:
pass
mediaDays = os.listdir(".")
mediaDays = [l for l in mediaDays if not l.startswith('.')]
mediaDays.sort()
for dd in mediaDays:
os.system("mv " + dd + " " + baseDir + media + "/" + yy)
os.chdir("..")
os.system("rm -R " + yy)
os.chdir("..")
os.system("rm -R " + media)
os.chdir(s['crawler'])
os.system("rm " + filename)
os.chdir(mydir)
os.chdir("..")
# os.chdir("..")
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class LajornadaSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
    def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
    def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
    def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
    def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
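# Usage note: this pipeline is enabled through ITEM_PIPELINES in settings.py and
# receives its output path through the custom 'filename' setting, e.g.
#   scrapy crawl noticias --nolog -s filename=noticias.json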
# -*- coding: utf-8 -*-
# Scrapy settings for laJornada project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'laJornada'
SPIDER_MODULES = ['laJornada.spiders']
NEWSPIDER_MODULE = 'laJornada.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'laJornada (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'laJornada.middlewares.LajornadaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'laJornada.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'laJornada.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = laJornada.settings
[deploy]
#url = http://localhost:6800/
project = laJornada
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = tribunaHn.settings
[deploy]
#url = http://localhost:6800/
project = tribunaHn
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class TribunahnSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
    def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
    def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
    def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
    def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for tribunaHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'tribunaHn'
SPIDER_MODULES = ['tribunaHn.spiders']
NEWSPIDER_MODULE = 'tribunaHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tribunaHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tribunaHn.middlewares.TribunahnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tribunaHn.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'tribunaHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
===============================================================================
THIS VERSION OF La Tribuna Honduras IS NOW DEPRECATED SINCE THE SITE'S WEB PAGE
CHANGED ITS ACCESS TO ACCESS BY DAY.
THE NEW VERSION CAN BE FOUND IN THE descarga_por_dia/foraneos FOLDER.
===============================================================================
"""
import scrapy, re, json
from datetime import date
from tribunaHn.items import NoticiasItem
"""
MEDIA:
La Tribuna, Honduras
USAGE:
// To get all the news from the most recent down to the oldest. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// To get all the news from the most recent down to a specific date. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=2 -a day=29
-------------------------------------------------------------------------------------------------
Afterwards it will be necessary to use the parse_date_files.py file so that the news contained
in noticias.json is split into files by date.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
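# DAT_RE extracts the 'YYYY/MM/DD' fragment embedded in article URLs so the news date
# can be compared against stopDate in parse(); e.g. a link containing '2018/02/15'
# becomes date(2018, 2, 15).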
class ImportantData(scrapy.Item):
section = scrapy.Field()
page = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
if self.year is not None and self.month is not None and self.day is not None:
self.stopDate = date(int(self.year), int(self.month), int(self.day))
else:
self.stopDate = None
baseURL = "http://www.latribuna.hn/"
sectionList = ["noticias", "honduras", "sociedad", "cafeteando", "dejenme-decirles", "desde-usa",
"ecomentarios", "el-cambio-climatico", "el-dossier-de-atenea", "enfoques",
"pecadillos-idiomaticos", "pildoritas", "columnistas", "editorial", "tribuna-del-pueblo",
"anales-historicos", "cine", "dejando-huellas", "dia-7", "dominicales", "done-un-aula",
"especiales-lt", "la-cobra-pregunta", "la-tribuna-agropecuaria", "la-tribuna-cultural",
"nuestro-orgullo", "turismo"]
# sectionList = ["noticias"]
for s in sectionList:
yield scrapy.Request(url=baseURL + s, callback=self.parse)
def parse(self, response):
CONTINUE_SEARCHING = True
linkList = response.xpath('//div[@id="main"]').css('article.linkbox').xpath('./a[@itemprop="url"]/@href').extract()
linkList.extend(response.xpath('//div[@id="main"]').css('div.bottom-margin').css('div.col-sm-6').xpath('./h3/a[@itemprop="url"]/@href').extract())
if self.stopDate is None:
for link in linkList:
yield scrapy.Request(url=link, callback=self.parse_item)
else:
for link in linkList:
res = DAT_RE.search(link)
if res:
dat = map(int, res.group(0).split("/"))
newsDate = date(dat[0], dat[1], dat[2])
if newsDate >= self.stopDate:
yield scrapy.Request(url=link, callback=self.parse_item)
else:
CONTINUE_SEARCHING = False
break
if CONTINUE_SEARCHING:
nextPage = response.xpath('//span[@class="next"]/a/@href').extract_first()
if nextPage is not None:
yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_item(self, response):
item = NoticiasItem()
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.xpath('//header/h1[@itemprop="name"]').extract_first())
try:
topic = response.xpath('//aside[@class="tags"]/ul/li/a/text()').extract()[0]
except:
topic = None
item['topic'] = topic
for p in response.css('div.article-post-content').css('p').extract():
text += remove_tags(p) + "\n"
item['text'] = text.strip()
item['url'] = response.url
yield item
[]
[]
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = yucatanALaMano.settings
[deploy]
#url = http://localhost:6800/
project = yucatanALaMano
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class YucatanalamanoSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
    def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
    def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
    def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
    def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for yucatanALaMano project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'yucatanALaMano'
SPIDER_MODULES = ['yucatanALaMano.spiders']
NEWSPIDER_MODULE = 'yucatanALaMano.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'yucatanALaMano (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'yucatanALaMano.middlewares.YucatanalamanoSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'yucatanALaMano.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'yucatanALaMano.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from yucatanALaMano.items import NoticiasItem
"""
MEDIA:
Yucatán a la Mano, Yuc.
USAGE:
scrapy crawl noticias --nolog -s filename=2017-03-22.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
# self.baseURL = "http://www.yucatanalamano.com/" + year + "/" + month + "/" + day
# self.baseURL = "http://yucatanalamano.com/" + year + "/" + month + "/" + day
urlList = ["http://www.yucatanalamano.com/" + year + "/" + month + "/" + day,
"http://yucatanalamano.com/" + year + "/" + month + "/" + day]
for url in urlList:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(1, pages):
yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
title = response.xpath('//div[@class="main-col"]/div[@itemprop="name"]/text()').extract_first()
if title is None:
title = response.xpath('//div[@class="main-col"]').css('h1').extract_first()
if title is not None:
item['title'] = remove_tags(title)
else:
item['title'] = title
d = response.css('div.mom-post-meta').css('span').css('time::attr(datetime)').extract_first()
        ## '-06:00' corresponds to UTC-6, the time zone of Yucatán (central Mexico)
if d[-6:] != '-06:00':
d = d[:-6] + '-06:00'
item['date'] = d
item['topic'] = response.css('div.breadcrumbs-plus').css('span').css('a::attr(title)').extract_first()
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
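# Sketch of the pagination step in parse() above: the last link inside 'div.pagination'
# ends with the total page count (e.g. '.../page/7/'), which is read back as an int so
# pages 2..N can be requested. Illustration only.
def total_pages(last_href):
    trimmed = last_href.strip('/')
    return int(trimmed[trimmed.rfind('/') + 1:])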
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiariodechiapasSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DiariodechiapasDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioDeChiapas project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioDeChiapas'
SPIDER_MODULES = ['diarioDeChiapas.spiders']
NEWSPIDER_MODULE = 'diarioDeChiapas.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioDeChiapas (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioDeChiapas.middlewares.DiariodechiapasDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioDeChiapas.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario de Chiapas, Chiapas
USAGE
$ cd diarioDeChiapas/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most recent to the oldest. It is necessary to use the parse_date_files.py file
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-30.json -a year=2018 -a month=8 -a day=30
"""
import scrapy, re, json
from datetime import datetime, date
from diarioDeChiapas.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
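# For instance, remove_tags('<p>Texto <b>destacado</b></p>') returns 'Texto destacado';
# the regex only strips markup and does not unescape HTML entities.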
class ImportantData(scrapy.Item):
"""
Useful data for the flow of the implementation
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
news_section = scrapy.Field()
return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
baseURL = "http://www.diariodechiapas.com/landing/"
section_list = ["editorial", "portada", "metropoli", "region", "la-roja",
"deportes", "boga", "ae", "trascendio"]
# section_list = ["editorial"]
if self.stopDate is None:
for s in section_list:
yield scrapy.Request(url=baseURL + s, callback=self.parse)
else:
for s in section_list:
flow_info = ImportantData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse(self, response):
link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()
section = response.xpath('//section[@class="wrapper"]/h1').extract_first()
if section is not None : section = remove_tags(section)
for link in link_list:
flow_info = ImportantData()
flow_info['news_section'] = section
request = scrapy.Request(url=link, callback=self.parse_item)
request.meta['item'] = flow_info
yield request
next_page = response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse)
def parse_with_stop_date(self, response):
flow_info = response.meta['item']
if not flow_info['to_next_page']:
link_list = response.xpath('//section[@class="page__content"]').css('section.post').xpath('./a[@class="post__link"]/@href').extract()
section = response.xpath('//section[@class="wrapper"]/h1').extract_first()
if section is not None : section = remove_tags(section)
for link in link_list:
flow_info = ImportantData()
flow_info['news_section'] = section
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
next_page = response.css('div.wp-pagenavi').css('a.nextpostslink').css('::attr(href)').extract_first()
if next_page is not None:
flow_info['to_next_page'] = False
request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
title = response.xpath('//section[@class="single__content"]/h1').extract_first()
if title is not None : title = remove_tags(title)
for p in response.xpath('//section[@class="single__content"]').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = flow_info['news_section']
item['title'] = title
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
news_date = news_date[:news_date.find('T')]
news_date = datetime.strptime(news_date, '%Y-%m-%d').date()
if news_date >= self.stopDate:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
title = response.xpath('//section[@class="single__content"]/h1').extract_first()
if title is not None : title = remove_tags(title)
for p in response.xpath('//section[@class="single__content"]').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['topic'] = flow_info['news_section']
item['title'] = title
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
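# Sketch of the stop-date flow above: parse_with_stop_date() emits one request per article link and
# marks the last link of the page with is_last_link=True. If that last article is still newer than
# (or equal to) stopDate, to_next_page is set to True and the section page is re-requested with
# dont_filter=True, so parse_with_stop_date() then follows the wp-pagenavi "next" link and repeats
# the cycle until an article older than stopDate is found.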
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioDeChiapas.settings
[deploy]
#url = http://localhost:6800/
project = diarioDeChiapas
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiarioindependienteSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class DiarioindependienteDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
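# The resulting file is a single JSON array: open_spider() writes "[", every item is appended as one
# json.dumps() object (comma-separated from the second item on), and close_spider() writes the closing
# "]". Fields missing from an item are simply skipped by the try/except blocks above.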
# -*- coding: utf-8 -*-
# Scrapy settings for diarioIndependiente project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioIndependiente'
SPIDER_MODULES = ['diarioIndependiente.spiders']
NEWSPIDER_MODULE = 'diarioIndependiente.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioIndependiente (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioIndependiente.middlewares.DiarioindependienteSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioIndependiente.middlewares.DiarioindependienteDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioIndependiente.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
"""
MEDIA:
Diario El Independiente, Baja California Sur
USAGE:
$ cd elIndependiente/
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to the oldest. It is necessary to use the parse_date_files.py file
so that the news contained in noticias.json is split into files by date. ##
$ scrapy crawl noticias --nolog -s filename=noticias.json
------------------------------------------------------------------------------------------------------------
## Get all the news from the most current to a specific date. ##
$ scrapy crawl noticias --nolog -s filename=2018-08-10.json -a year=2018 -a month=8 -a day=10
"""
import scrapy, re, json
from datetime import datetime, date
from diarioIndependiente.items import NoticiasItem
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class ImportantFlowData(scrapy.Item):
"""
Useful data for the flow of the implementation
"""
to_next_page = scrapy.Field()
is_last_link = scrapy.Field()
return_url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
"""
Basic Scrapy Spider class
"""
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stop_date = date(int(year), int(month), int(day))
base_url = "https://www.diarioelindependiente.mx/" + year + "/" + month + "/"
else:
self.stop_date = None
section_list = ["la-paz", "los-cabos", "policiaca", "deportes", "cultura", "nacional",
"internacional", "opinion", "espectaculos", "tecnologia"]
base_url = "https://www.diarioelindependiente.mx/"
if self.stop_date is None:
for s in section_list:
yield scrapy.Request(url=base_url + s, callback=self.parse)
else:
flow_info = ImportantFlowData()
flow_info['to_next_page'] = False
request = scrapy.Request(url=base_url, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse(self, response):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
pagination = response.css('div.paginacion').xpath('./ul/li/a/@href').extract()
if len(pagination) > 0:
pagination = pagination[-2]
pages = int(pagination[pagination.rfind('=') + 1:])
for page in range(1, pages):
yield scrapy.Request(url=response.url + "?page=" + str(page + 1), callback=self.parse_page)
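# Example of the pagination arithmetic above, assuming the last pagination href ends in "?page=7":
# pages = 7, so requests are issued for response.url + "?page=2" through "?page=7", while the first
# page itself is handled by the dont_filter request yielded at the top of parse().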
def parse_page(self, response):
link_list = response.xpath('//div[@id="colNoticias"]').css('article.card__article').xpath('./h2/a/@href').extract()
for link in link_list:
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_with_stop_date(self, response):
flow_info = response.meta['item']
if not flow_info['to_next_page']:
link_list = response.xpath('//div[@id="colNoticias"]').css('article.card__article').xpath('./h2/a/@href').extract()
for link in link_list:
flow_info = ImportantFlowData()
flow_info['return_url'] = response.url
if link == link_list[-1] : flow_info['is_last_link'] = True
else : flow_info['is_last_link'] = False
request = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
request.meta['item'] = flow_info
yield request
else:
next_page = response.css('div.paginacion').xpath('./ul/li/a[@rel="next"]/@href').extract_first()
if next_page is not None:
flow_info['to_next_page'] = False
request = scrapy.Request(url=next_page, callback=self.parse_with_stop_date)
request.meta['item'] = flow_info
yield request
def parse_item(self, response):
item = NoticiasItem()
text = ''
news_date = response.xpath('//meta[@name="date"]/@content').extract_first()
topic = response.xpath('//span[@class="badge"]').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.cuerpo_noticia').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = datetime.strptime(news_date, '%Y-%m-%d').isoformat("T")
item['title'] = remove_tags(response.css('h1.colorRojo').extract_first())
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
news_date = response.xpath('//meta[@name="date"]/@content').extract_first()
news_date = datetime.strptime(news_date, '%Y-%m-%d').date()
if news_date >= self.stop_date:
flow_info = response.meta['item']
item = NoticiasItem()
text = ''
news_date = datetime.strptime(news_date.isoformat(), '%Y-%m-%d').isoformat("T")
title = response.css('h1.colorRojo').extract_first()
if title is not None : title = remove_tags(title)
topic = response.xpath('//span[@class="badge"]').extract_first()
if topic is not None : topic = remove_tags(topic)
for p in response.css('div.cuerpo_noticia').css('p').extract():
text += remove_tags(p) + "\n"
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
if flow_info['is_last_link']:
flow_info['to_next_page'] = True
request = scrapy.Request(url=flow_info['return_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = flow_info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = diarioIndependiente.settings
[deploy]
#url = http://localhost:6800/
project = diarioIndependiente
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DiarioyucatanSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for diarioYucatan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'diarioYucatan'
SPIDER_MODULES = ['diarioYucatan.spiders']
NEWSPIDER_MODULE = 'diarioYucatan.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'diarioYucatan (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'diarioYucatan.middlewares.DiarioyucatanSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'diarioYucatan.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'diarioYucatan.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re
from diarioYucatan.items import NoticiasItem
# from datetime import datetime, date, timedelta
# from scrapy.spidermiddlewares.httperror import HttpError
"""
MEDIO:
Diario de Yucatán, Yuc.
Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
de una fecha especifica.
USO:
scrapy crawl noticias --nolog -s filename=noticias.json
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes', 'mexico/quintana-roo',
'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud', 'editorial', 'multimedia']
# section_list = ['yucatan', 'salud', 'editorial', 'imagen',
# 'merida', 'merida/clima', 'merida/gobierno', 'merida/policia', 'merida/politica',
# 'mexico', 'mexico/quintana-roo', 'mexico/cdmx', 'mexico/economia', 'mexico/campeche',
# 'internacional', 'internacional/asia', 'internacional/europa', 'internacional/africa',
# 'internacional/america', 'internacional/oceania',
# 'deportes', 'deportes/futbol', 'deportes/nfl',
# 'espectaculos', 'espectaculos/cine', 'espectaculos/farandula', 'espectaculos/musica',
# 'espectaculos/tv-y-series',
# 'tecnologia', 'tecnologia/redes-sociales', 'tecnologia/innovaciones',
# 'multimedia', 'multimedia/fotos', 'multimedia/videos']
self.globalLinkSet = set()
self.baseURL = "http://www.yucatan.com.mx/seccion/"
self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
for s in section_list:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response):
pathList = ['//*[@class="g1-column"]/div/div/article',
'//*[@class="g1-collection g1-collection-columns-2"]/div/ul/li/article']
for path in pathList:
for entry in response.xpath(path):
link = entry.css('h3').css('a::attr(href)').extract_first()
if not link in self.globalLinkSet:
self.globalLinkSet.add(link)
item = NoticiasItem()
item['topic'] = remove_tags(response.xpath('//h2[@class="g1-delta g1-delta-2nd resaltartitulo"]').extract_first())
request = scrapy.Request(url=link, callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
item = response.meta['item']
text = ''
item['date'] = response.xpath('//time[@class="entry-date"]/@datetime').extract_first() + "-06:00"
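# The entry-date attribute apparently carries no UTC offset, so "-06:00" (central Mexico time) is
# appended by hand to yield an ISO-8601 timestamp such as "2018-08-30T12:00:00-06:00".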
item['title'] = remove_tags(response.xpath('//h1[@class="g1-mega g1-mega-1st entry-title"]').extract_first())
for p in response.xpath('//*[@itemprop="articleBody"]/p').extract():
text += remove_tags(p) + "\n"
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = diarioYucatan.settings
[deploy]
#url = http://localhost:6800/
project = diarioYucatan
[]
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class ElfinancieroSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for elFinanciero project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'elFinanciero'
SPIDER_MODULES = ['elFinanciero.spiders']
NEWSPIDER_MODULE = 'elFinanciero.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'elFinanciero.middlewares.ElfinancieroSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'elFinanciero.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'elFinanciero.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re, json
from elFinanciero.items import NoticiasItem
from collections import OrderedDict
from datetime import datetime, date, timedelta, tzinfo
"""
MEDIO:
El Financiero, CDMX
DESCARGA HACIA ATRÁS:
Este crawler no descarga las noticias de un día específico, sino que descarga las todas las noticias desde
la fecha más reciente hasta la fecha indicada con los parámetros 'year', 'month', 'day'
USO:
scrapy crawl noticias --nolog -s filename=2018-02-06.json -a year=2018 -a month=2 -a day=6
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
TIME = re.compile(r'\d{1,2}:\d{2} ?[ap]m')
PUB = re.compile(r'"publishedAt":.*?,')
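# TIME matches plain clock strings such as "3:45 pm" (apparently used by the site for same-day posts),
# while PUB grabs the '"publishedAt": ...,' fragments from the JSON returned by the search API used in
# continue_searching() below.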
class ImportantData(scrapy.Item):
section = scrapy.Field()
url1 = scrapy.Field()
url2 = scrapy.Field()
page = scrapy.Field()
res = scrapy.Field()
class SectionData(scrapy.Item):
section = scrapy.Field()
class UTC(tzinfo):
"""clase para el 'time zone' (zona horaria)"""
def utcoffset(self, dt):
# zona horaria para aguascalientes (centro de méxico): utc-6
return timedelta(hours=-6)
def tzname(self, dt):
# nombre de la zona horaria
return 'UTC-6'
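# Minimal usage sketch of the UTC class above:
#   datetime(2018, 2, 6, 12, 0, 0, tzinfo=UTC()).isoformat("T")
# yields '2018-02-06T12:00:00-06:00', which is the format stored in item['date'] by parse_item().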
class QuotesSpider(scrapy.Spider):
name = "noticias"
newsSet = set()
def start_requests(self):
self.tz = UTC()
# year = getattr(self, "year", None)
# month = getattr(self, "month", None)
# day = getattr(self, "day", None)
# self.currentDate = date(int(year), int(month), int(day))
self.currentDate = datetime.now().date()
# self.currentDate = date(2018, 2, 23)
self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
sectionList = ["economia", "empresas", "nacional", "culturas",
"deportes", "mundo", "bajio", "tech", "ciencia"]
self.baseURL = "http://www.elfinanciero.com.mx/"
"""
Ejemplo de URL para las noticias de días anteriores para la sección Economía:
http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22economia%22}&type=page&page=2&size=10
"""
self.uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
self.uri_page = "%22}&type=page&page="
self.uri_complement = "&size=10"
print(self.uri_base+self.uri_page+self.uri_complement)
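# The API query for older posts is assembled from the three fragments above; for example, page 2 of
# the "economia" section is
#   uri_base + "economia" + uri_page + "2" + uri_complement
# which reproduces the sample URL shown in the comment above.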
for s in sectionList:
yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
def parse(self, response):
print(response.url)
searchData = ImportantData()
CONTINUE_SEARCHING = True
section = response.url[response.url.rfind("/") + 1:]
for link in response.css('div.is-multiline').css('div.column-box').xpath('./a/@href').extract():
if link.find("/") == 0:
link = link[1:]
yield scrapy.Request(url=self.baseURL + link, callback=self.parse_item)
newsLinkList = response.xpath('//div[@class="column feed"]/a/@href').extract()
newsDateList = response.xpath('//div[@class="column feed"]').css('p.date-time::text').extract()
postDict = OrderedDict(zip(newsLinkList, newsDateList))
for uri in postDict.keys():
dt = postDict[uri]
res = TIME.match(dt)
if res:
postDate = datetime.now().date()
else:
postDate = datetime.strptime(dt, "%d/%m/%Y").date()
if postDate >= self.currentDate:
if uri.find("/") == 0:
uri = uri[1:]
yield scrapy.Request(url=self.baseURL + uri, callback=self.parse_item)
else:
CONTINUE_SEARCHING = False
break
if CONTINUE_SEARCHING:
page = 2
url = self.uri_base + section + self.uri_page + str(page) + self.uri_complement
searchData['section'] = section
searchData['page'] = page
request = scrapy.Request(url=url, callback=self.continue_searching, dont_filter=True)
request.meta['item'] = searchData
yield request
def continue_searching(self, response):
CONTINUE_SEARCHING = True
searchData = response.meta['item']
REG_EXPR = re.compile(r'"' + re.escape(searchData['section']) + r'\\/.*?"')
lList = REG_EXPR.findall(response.text)
pList = PUB.findall(response.text)
linkList = [l.replace("\\", '').replace('"', '') for l in lList]
isodateList = [d[d.find(":")+1:].replace('"', '').replace(",", '') for d in pList]
postDict = OrderedDict(zip(linkList, [iso[:iso.find("T")] for iso in isodateList]))
for uri in postDict.keys():
dt = postDict[uri]
res = TIME.match(dt)
if res:
postDate = datetime.now().date()
else:
postDate = datetime.strptime(dt, "%Y-%m-%d").date()
if postDate >= self.currentDate:
if uri.find("/") == 0:
uri = uri[1:]
yield scrapy.Request(url=self.baseURL + uri, callback=self.parse_item)
else:
CONTINUE_SEARCHING = False
break
if CONTINUE_SEARCHING:
searchData['page'] += 1
url = self.uri_base + searchData['section'] + self.uri_page + str(searchData['page']) + self.uri_complement
request = scrapy.Request(url=url, callback=self.continue_searching)
request.meta['item'] = searchData
yield request
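# In both parse() and continue_searching() the crawl walks the feed backwards: entries whose date
# string matches TIME (a bare clock time) are treated as published today, other entries are parsed as
# dd/mm/YYYY or ISO dates, and as soon as a post older than currentDate appears the paging stops;
# otherwise the next API page is requested for the same section with an incremented page number.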
def parse_item(self, response):
if not response.url in self.newsSet:
self.newsSet.add(response.url)
item = NoticiasItem()
text = ''
res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
resDict = json.loads(res)
dt = resDict['datePublished']
d, t = dt.split()
d = map(int, d.split("-"))
t = map(int, t.split(":"))
dat = date(d[0], d[1], d[2])
if dat >= self.currentDate:
item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
topic = response.xpath('//div[@class="section-line"]').extract_first()
if topic is not None:
item['topic'] = remove_tags(topic)
else:
item['topic'] = None
author = response.xpath('//div[@class="note-author"]/a').extract_first()
if author is not None:
item['author'] = remove_tags(author)
for p in response.css('div.content').css('p').extract():
text += remove_tags(p) + '\n'
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
# location = G_RE.sub('', m).strip()
# if len(location) <= 35:
# item['location'] = location
# text = text[text.find(m)+len(m):]
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text)
# text = TAG3_RE.sub('', text)
item['text'] = text.strip()
item['url'] = response.url
yield item
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = elFinanciero.settings
[deploy]
#url = http://localhost:6800/
project = elFinanciero
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class AmandalaSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
def __init__(self, filename):
self.filename = filename
@classmethod
def from_crawler(cls, crawler):
# Here you get whatever value was passed through the "filename" command line parameter
settings = crawler.settings
filename = settings.get('filename')
# Instantiate the pipeline with the file name
return cls(filename)
def open_spider(self, spider):
self.counter = 0
self.file = open(self.filename, 'w')
self.file.write("[")
def close_spider(self, spider):
self.file.write("]")
self.file.close()
def process_item(self, item, spider):
# print("this is my item", item)
row = []
try:
row.append(("date", item['date']))
except:
pass
try:
row.append(("topic", item['topic']))
except:
pass
try:
row.append(("title", item['title']))
except:
pass
try:
row.append(("author", item['author']))
except:
pass
try:
row.append(("location", item['location']))
except:
pass
try:
row.append(("text", item['text']))
except:
pass
try:
row.append(("url", item['url']))
except:
pass
line = OrderedDict(row)
self.counter += 1
if self.counter == 1:
self.file.write(json.dumps(line))
elif self.counter > 1:
self.file.write(",\n" + json.dumps(line))
return item
# -*- coding: utf-8 -*-
# Scrapy settings for amandala project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'amandala'
SPIDER_MODULES = ['amandala.spiders']
NEWSPIDER_MODULE = 'amandala.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'amandala (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'amandala.middlewares.AmandalaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'amandala.middlewares.MyCustomDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'amandala.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
# -*- coding: utf-8 -*-
import scrapy, re, json
from datetime import datetime, date
from amandala.items import NoticiasItem
"""
MEDIO:
Amandala, Belice
USO:
// Si se quiere obtener todas las noticias desde las más actuales hasta las más antiguas. //
scrapy crawl noticias --nolog -s filename=noticias.json
-------------------------------------------------------------------------------------------------
// Si se quiere obtener todas las noticias desde las más actuales hasta una fecha específica. //
scrapy crawl noticias --nolog -s filename=noticias.json -a year=2018 -a month=3 -a day=5
-------------------------------------------------------------------------------------------------
Después será necesario hacer uso del archivo parse_date_files.py para que las noticias contenidas
en noticias.json sean separadas en archivos por fecha.
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
DAT_RE = re.compile(r'\d{4}\/\d{2}\/\d{2}')
class ImportantData(scrapy.Item):
CONTINUE_SEARCHING = scrapy.Field()
LAST_LINK = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
if year is not None and month is not None and day is not None:
self.stopDate = date(int(year), int(month), int(day))
else:
self.stopDate = None
baseURL = "http://amandala.com.bz/news/category/"
sectionList = ["headline", "highlights", "general", "features",
"sports", "letters", "editorial", "publisher"]
# sectionList = ["headline"]
if self.stopDate is None:
for s in sectionList:
yield scrapy.Request(url=baseURL + s, callback=self.parse)
else:
for s in sectionList:
info = ImportantData()
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s, callback=self.parse_with_stop_date)
request.meta['item'] = info
yield request
def parse(self, response):
linkList = response.xpath('//div[@id="list_categories"]').css('div.content').css('h3').css('a::attr(href)').extract()
for link in linkList:
yield scrapy.Request(url=link, callback=self.parse_item)
nextPage = response.xpath('//div[@id="entries"]').css('div.old_entries').css('a::attr(href)').extract_first()
if nextPage is not None:
yield scrapy.Request(url=nextPage, callback=self.parse)
def parse_with_stop_date(self, response):
searchData = response.meta['item']
CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
if not CONTINUE_SEARCHING:
linkList = response.xpath('//div[@id="list_categories"]').css('div.content').css('h3').css('a::attr(href)').extract()
for link in linkList:
info = ImportantData()
info['url'] = response.url
if link == linkList[-1]: info['LAST_LINK'] = True
else: info['LAST_LINK'] = False
reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
reqst.meta['item'] = info
yield reqst
else:
nextPage = response.xpath('//div[@id="entries"]').css('div.old_entries').css('a::attr(href)').extract_first()
if nextPage is not None:
searchData['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=nextPage, callback=self.parse_with_stop_date)
request.meta['item'] = searchData
yield request
def parse_item(self, response):
item = NoticiasItem()
text = ''
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['date'] = datetime.strptime(d, '%Y-%m-%d').isoformat("T")
item['title'] = remove_tags(response.xpath('//div[@class="active"]/h1/a').extract_first())
try:
topic = response.css('div.date').css('span.date').css('a::text').extract()[0]
except:
topic = None
item['topic'] = topic
for p in response.css('div.content').css('p').extract():
text += remove_tags(p) + "\n"
item['text'] = text.strip()
item['url'] = response.url
yield item
def parse_item_with_stop_date(self, response):
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
dt = datetime.strptime(d, '%Y-%m-%d').date()
if dt >= self.stopDate:
info = response.meta['item']
item = NoticiasItem()
text = ''
item['date'] = datetime.strptime(d, '%Y-%m-%d').isoformat("T")
item['title'] = remove_tags(response.xpath('//div[@class="active"]/h1/a').extract_first())
try:
topic = response.css('div.date').css('span.date').css('a::text').extract()[0]
except:
topic = None
item['topic'] = topic
for p in response.css('div.content').css('p').extract():
text += remove_tags(p) + "\n"
item['text'] = text.strip()
item['url'] = response.url
yield item
if info['LAST_LINK']:
info['CONTINUE_SEARCHING'] = True
request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = info
yield request
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = amandala.settings
[deploy]
#url = http://localhost:6800/
project = amandala
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class NoticiasItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class HeraldohnSpiderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.
# Should return None or raise an exception.
return None
def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.
# Must return an iterable of Request, dict or Item objects.
for i in result:
yield i
def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.
# Should return either None or an iterable of Response, dict
# or Item objects.
pass
def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.
# Must return only requests (not items).
for r in start_requests:
yield r
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
class HeraldohnDownloaderMiddleware(object):
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.
# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
# Must either:
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
from collections import OrderedDict
class JsonWriterPipeline(object):
    def __init__(self, filename):
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        # Here you get whatever value was passed through the "filename" command line parameter
        settings = crawler.settings
        filename = settings.get('filename')
        # Instantiate the pipeline with the file name
        return cls(filename)

    def open_spider(self, spider):
        self.counter = 0
        self.file = open(self.filename, 'w')
        self.file.write("[")

    def close_spider(self, spider):
        self.file.write("]")
        self.file.close()

    def process_item(self, item, spider):
        # Keep only the fields that were actually set on the item, in this fixed column order.
        row = []
        for field in ('date', 'topic', 'title', 'author', 'location', 'text', 'url'):
            if field in item:
                row.append((field, item[field]))
        line = OrderedDict(row)
        self.counter += 1
        if self.counter == 1:
            self.file.write(json.dumps(line))
        elif self.counter > 1:
            self.file.write(",\n" + json.dumps(line))
        return item
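The pipeline depends on a custom filename setting that is not defined in settings.py, so it has to be supplied per run, e.g. scrapy crawl <spider> -s filename=2018-01-01.json. A hedged equivalent from Python, with the spider name and output path as placeholders:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('filename', '2018-01-01.json')  # read back by JsonWriterPipeline.from_crawler
process = CrawlerProcess(settings)
process.crawl('noticias')  # placeholder spider name
process.start()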
# -*- coding: utf-8 -*-
# Scrapy settings for heraldoHn project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'heraldoHn'
SPIDER_MODULES = ['heraldoHn.spiders']
NEWSPIDER_MODULE = 'heraldoHn.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'heraldoHn (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'heraldoHn.middlewares.HeraldohnSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'heraldoHn.middlewares.HeraldohnDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'heraldoHn.pipelines.JsonWriterPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.