Commit c69e5696 authored by Renán Sosa Guillen

crawlers update

parent 2b04c432
#!/bin/bash
for y in `seq 2016 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 26)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
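The scripts in this commit hard-code each month's length in if-chains. For reference only (this sketch is not part of the commit), the same date loop can be written with Python's calendar module, which also handles leap years:

import calendar
import subprocess

# Sketch: visit every valid date and invoke the crawler the same way the
# shell scripts do; the scrapy command line is copied from the scripts.
for year in range(2016, 2018):
    for month in range(1, 13):
        days = calendar.monthrange(year, month)[1]  # number of days in this month
        for day in range(1, days + 1):
            out = "%d-%02d-%02d.json" % (year, month, day)
            subprocess.call(["scrapy", "crawl", "noticias", "-t", "json", "-o", out,
                             "-a", "year=%d" % year, "-a", "month=%02d" % month,
                             "-a", "day=%02d" % day])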
#!/bin/bash
for y in `seq 2005 2015`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
@@ -40,7 +40,7 @@ class QuotesSpider(scrapy.Spider):
         self.data_list = []
         self.baseURL = 'http://jornadabc.mx'
         # section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
-        section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
+        section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura', 'espectaculos', 'deportes']
         for section in section_list:
             self.section = section
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # check whether the file contains data; if not, delete it
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
    done
    cd ..
done
\ No newline at end of file
import scrapy
import json
import re
from datetime import datetime, date, timedelta

# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        # self.found = False
        # self.flag = False
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.req_date = date(int(self.year), int(self.month), int(self.day))
        self.date_format = "%Y-%m-%d"
        self.baseURL = 'https://www.lajornadamaya.mx'
        section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional',
                        'internacional', 'opinion']
        # section_list = ['deportes']
        for section in section_list:
            self.section = section
            for count in range(0, 2):
                if count == 0:
                    yield scrapy.Request(url=self.baseURL+'/'+section, callback=self.parse_2)
                elif count == 1:
                    # self.section = section
                    self.page = 0
                    self.flag = False
                    self.found = False
                    page = -1
                    if not section == 'opinion':
                        # request listing pages until a callback raises self.flag;
                        # start_requests is consumed lazily, so the flag set by
                        # parse() can stop this loop
                        while True:
                            if self.flag:
                                self.flag = False
                                break
                            page += 1
                            yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(page), callback=self.parse)
                        if self.found:
                            self.found = False
                            self.page -= 1
                            if self.page > 0:
                                self.page -= 1
                            for pag in range(self.page, self.page+6):
                                yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(pag), callback=self.parse_page, dont_filter=True)
                    else:
                        yield scrapy.Request(url=self.baseURL+'/notas?opinion', callback=self.parse_page)

    def parse_2(self, response):  # for the most recent news
        path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
        link_list = []
        for path in path_list:
            link_list += response.xpath(path).extract()
        for link in link_list:
            if link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2):
                item = NoticiasItem()
                item['date'] = link[:link.rfind('/')]
                item['topic'] = response.url[response.url.rfind('/')+1:].title()
                # yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
                request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
                request.meta['item'] = item
                yield request

    def parse(self, response):  # for the JSON listings
        json_response = json.loads(response.text)
        if not response.url[response.url.rfind('/')+1:] == 'notas?opinion':
            json_list = json_response
        else:
            json_list = json_response['articles']
        for line in json_list:
            this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
            this_date = this_date.date()
            if this_date == self.req_date:
                self.page = int(response.url[response.url.rfind('=')+1:])
                self.found = True
                self.flag = True
                break
            elif this_date < self.req_date:
                self.flag = True
                break

    def parse_item_2(self, response):  # for the most recent news
        item = response.meta['item']
        # item = NoticiasItem()
        text = ''
        # item['date'] = response.url[:response.url.rfind('/')][response.url[:response.url.rfind('/')].rfind('/')+1:]
        # item['topic'] = self.section.title()
        item['title'] = response.xpath('//article/h1/text()').extract_first()
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        print item['title']
        yield item

    def parse_page(self, response):  # for the JSON listings
        json_response = json.loads(response.text)
        if not response.url[response.url.rfind('/')+1:] == 'notas?opinion':
            topic = response.url[response.url.rfind('/')+1:response.url.rfind('=')-2].title()
            json_list = json_response
        else:
            json_list = json_response['articles']
            topic = 'Opinion'
        for line in json_list:
            this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
            this_date = this_date.date()
            if this_date == self.req_date:
                item = NoticiasItem()
                item['date'] = line['publishDate']
                item['topic'] = topic
                item['title'] = line['name']
                if not response.url[response.url.rfind('/')+1:] == 'notas?opinion':
                    request = scrapy.Request(url=self.baseURL+line['url'], callback=self.parse_item)
                else:
                    request = scrapy.Request(url=self.baseURL+'/'+line['publishDate'][:line['publishDate'].rfind(' ')]+'/'+line['uriComponent'], callback=self.parse_item)
                request.meta['item'] = item
                yield request

    def parse_item(self, response):  # for the JSON listings
        item = response.meta['item']
        text = ''
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        print item['title']
        yield item
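parse() and parse_page() above assume each listing response is JSON whose entries expose publishDate, name, url and uriComponent. A hypothetical entry with made-up values; only the field names come from the spider code:

# Illustrative listing entry consumed by parse()/parse_page(); values are invented.
sample_entry = {
    "publishDate": "2017-03-22 10:15:00",        # date part is compared against req_date
    "name": "Some headline",                     # becomes item['title']
    "url": "/yucatan/2017-03-22/some-headline",  # appended to baseURL for section feeds
    "uriComponent": "some-headline",             # used to build opinion article URLs
}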
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # check whether the file contains data; if not, delete it
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 17)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 2 4)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 20)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 6 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
@@ -24,13 +24,13 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY=3
+DOWNLOAD_DELAY=2
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN=16
 #CONCURRENT_REQUESTS_PER_IP=16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED=False
+COOKIES_ENABLED=False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED=False
@@ -5,48 +5,138 @@ import scrapy
 import re
 from datetime import datetime, date, timedelta

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)

 class NoticiasItem(scrapy.Item):
     title = scrapy.Field()
     text = scrapy.Field()
     date = scrapy.Field()
     location = scrapy.Field()
     author = scrapy.Field()
     topic = scrapy.Field()
     url = scrapy.Field()

+# class QuotesSpider(scrapy.Spider):
+#     name = "noticias"
+#     def start_requests(self):
+#         year = getattr(self, 'year', None)
+#         month = getattr(self, 'month', None)
+#         day = getattr(self, 'day', None)
+#         self.baseURL='http://yucatan.com.mx/'+year+'/'+month+'/'+day
+#         urls = [
+#             self.baseURL,
+#         ]
+#         for url in urls:
+#             yield scrapy.Request(url=url, callback=self.parse)
+#     def parse(self, response):
+#         pagination = response.css('div.pagination').css('a::attr(href)').extract()
+#         if ( len(pagination) > 0 ):
+#             pagination = pagination[-1]
+#             pages = int(pagination[pagination.rfind('/')+1:])
+#             for page in range(0,pages):
+#                 if ( page == 0 ):
+#                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+#                 else:
+#                     yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+#         else:
+#             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+#     def parse_page(self, response):
+#         for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
+#             yield scrapy.Request(url=link, callback=self.parse_item)
+#     def parse_item(self, response):
+#         text = ''
+#         item = NoticiasItem()
+#         item['title'] = response.css('h1.entry-title::text').extract_first()
+#         item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
+#         for paragraph in response.css('div.entry-content').css('p').extract():
+#             text += remove_tags(paragraph) + '\n'
+#         item['text'] = text
+#         item['topic'] = response.xpath('//span[@itemprop="title"]/text()').extract()
+#         item['url'] = response.url
+#         # print item['title']
+#         yield item

 class QuotesSpider(scrapy.Spider):
     name = "noticias"
     def start_requests(self):
+        section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes',
+                        'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud']
         year = getattr(self, 'year', None)
         month = getattr(self, 'month', None)
         day = getattr(self, 'day', None)
-        self.baseURL='http://yucatan.com.mx/'+year+'/'+month+'/'+day
+        self.baseURL='http://yucatan.com.mx/seccion/'
+        self.date = date(int(year), int(month), int(day))
+        self.parsing_month = { 'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
+                               'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12 }
+        self.stop = False
-        urls = [
-            self.baseURL,
-        ]
-        for url in urls:
-            yield scrapy.Request(url=url, callback=self.parse)
+        for s in section_list:
+            yield scrapy.Request(url=self.baseURL+s, callback=self.parse)
     def parse(self, response):
-        pagination = response.css('div.pagination').css('a::attr(href)').extract()
+        if ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) > 0 ):
+            for link in response.xpath('//a[@class="show-more-link"]/@href').extract():
+                yield scrapy.Request(url=link, callback=self.parse_pagination)
+        elif ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) == 0 ):
+            yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)
+    def parse_pagination(self, response):
+        pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
         if ( len(pagination) > 0 ):
             pagination = pagination[-1]
             pages = int(pagination[pagination.rfind('/')+1:])
-            for page in range(0,pages):
-                if ( page == 0 ):
-                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            p = 1
+            while p <= pages:
+                if ( self.stop ):
+                    p = pages+1
                 else:
-                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
+                    if ( p == 1 ):
+                        yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
+                    elif ( p > 1 ):
+                        yield scrapy.Request(url=response.url+'/page/'+str(p+1), callback=self.parse_link)
+                    p += 1
         else:
-            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
+            yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
+    def parse_link(self, response):
+        for entry in response.xpath('//*[@class="bp-entry"]'):
+            entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
+            entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]
+            news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:1]))
+            link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
+            if news_date == self.date and link is not None:
+                yield scrapy.Request(url=link, callback=self.parse_item)
+            elif news_date < self.date:
+                self.stop = True
-    def parse_page(self, response):
-        for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
-            yield scrapy.Request(url=link, callback=self.parse_item)
     def parse_item(self, response):
         text = ''
@@ -56,7 +146,7 @@ class QuotesSpider(scrapy.Spider):
         for paragraph in response.css('div.entry-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
-        item['topic'] = response.xpath('//span[@itemprop="title"]/text()').extract()
+        item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
         item['url'] = response.url
         # print item['title']
         yield item
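The rewritten diarioYucatan spider above pages through each section with a stop flag: parse_link sets self.stop once it sees an entry older than the requested date, and parse_pagination then stops scheduling further pages. A condensed, synchronous sketch of that pattern (fetch_entries is a hypothetical helper standing in for the Scrapy callbacks):

def crawl_section(pages, target, fetch_entries):
    # fetch_entries(p) -> list of (entry_date, link) for listing page p,
    # assumed to be ordered newest first
    stop = False
    for p in range(1, pages + 1):
        if stop:
            break  # an earlier page already went past the target date
        for entry_date, link in fetch_entries(p):
            if entry_date == target:
                yield link  # entry published on the requested day
            elif entry_date < target:
                stop = True  # everything further back is older; stop paging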
#!/bin/bash
section=sitios_yucatan
site=diarioYucatan
for y in `seq 2012 2012`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 7 7)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 31 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # check whether the file contains data; if not, delete it
                if [ -e $y-$m-$d.json ] # check whether the JSON file with the news was generated
                then
                    destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated JSON files are stored
                    if [ ! -d $destination ] # create the destination path if it does not exist
                    then
                        mkdir -p $destination
                    fi
                    mv -f $y-$m-$d.json $destination # move the JSON file to the destination
                fi
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 17)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
                if [ -e $y-$m-$d.json ] # check whether the JSON file with the news was generated
                then
                    destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated JSON files are stored
                    if [ ! -d $destination ] # create the destination path if it does not exist
                    then
                        mkdir -p $destination
                    fi
                    mv -f $y-$m-$d.json $destination # move the JSON file to the destination
                fi
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
                if [ -e $y-$m-$d.json ] # check whether the JSON file with the news was generated
                then
                    destination=$HOME/crawler/prueba/$section/$site/$y/ # path where the generated JSON files are stored
                    if [ ! -d $destination ] # create the destination path if it does not exist
                    then
                        mkdir -p $destination
                    fi
                    mv -f $y-$m-$d.json $destination # move the JSON file to the destination
                fi
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 4 4)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 7 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 17)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 6)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 2
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False
 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
import scrapy
import re

# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22

TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    return TAG_RE.sub('', text)


class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    name = "noticias"

    def start_requests(self):
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL = 'http://laverdadnoticias.com/'+year+'/'+month+'/'+day
        urls = [
            self.baseURL,
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        pagination = response.css("div.pagination_div").css("a::attr(href)").extract()
        if len(pagination) > 0:
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        for post in response.css('div.col-md-12').css('div.cp-post-content'):
            # create a fresh item per post; reusing one item across requests
            # would let concurrent callbacks overwrite each other's fields
            item = NoticiasItem()
            item['topic'] = post.css('ul.cp-post-tools').css('li').css('a::attr(title)').extract()
            item['author'] = post.css('ul.cp-post-tools').xpath('./li[2]/text()').extract_first()
            request = scrapy.Request(url=post.xpath('./h3/a/@href').extract_first(), callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        item = response.meta['item']
        text = ''
        item['title'] = response.css("div.cp-post-content").css("h1").css("a::text").extract_first()
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        for paragraph in response.css("div.cp-post-content").css("p").extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
\ No newline at end of file
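The request.meta['item'] handoff used in both spiders above is the usual Scrapy way to fill one item across two callbacks (listing page, then article page). A condensed sketch of the pattern; the spider name, URL and selectors here are illustrative, not taken from the repo:

import scrapy

class HandoffSpider(scrapy.Spider):
    name = "handoff_sketch"  # hypothetical spider for illustration
    start_urls = ['http://example.com/listing']

    def parse(self, response):
        for href in response.css('h3 a::attr(href)').extract():
            item = {'listing_page': response.url}  # fields known only on the listing page
            request = scrapy.Request(response.urljoin(href), callback=self.parse_detail)
            request.meta['item'] = item  # hand the partial item to the next callback
            yield request

    def parse_detail(self, response):
        item = response.meta['item']
        item['title'] = response.css('h1::text').extract_first()  # detail-page field
        yield item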
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 4 4)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 17)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2016 2016`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 12)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 29)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 4)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json # check whether the file contains data; if not, delete it
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 30)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
                [ -s $y-$m-$d.json ] || rm $y-$m-$d.json
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 3)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 17)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
for y in `seq 2017 2017`;
do
    if [ ! -d $y ]; then
        mkdir -p $y;
    fi
    cd $y
    for m in $(seq -f "%02g" 1 4)
    do
        if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
            for d in $(seq -f "%02g" 1 31)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
            for d in $(seq -f "%02g" 1 17)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
        if [ $m -eq 2 ]; then
            for d in $(seq -f "%02g" 1 28)
            do
                scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
            done
        fi
    done
    cd ..
done
\ No newline at end of file