Commit c69e5696 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawlers update

parent 2b04c432
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016-2017, writing one
# JSON file per date inside a per-year directory.
#
# Fixes vs. the previous version: the 30-day months (Apr/Jun/Sep/Nov) only
# looped to day 26, and February was hard-coded to 28 days even though 2016
# is a leap year — those dates were never crawled.
for y in $(seq 2016 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  # February length depends on the year (Gregorian leap-year rule).
  if (( y % 4 == 0 && ( y % 100 != 0 || y % 400 == 0 ) )); then
    feb=29
  else
    feb=28
  fi
  for m in $(seq -f "%02g" 1 12); do
    # Last day to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=$feb ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2005-2015, one JSON
# file per date in a per-year directory; empty result files are deleted.
for y in $(seq 2005 2015); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
      # Drop the output when the spider found nothing for this date.
      [ -s "$y-$m-$d.json" ] || rm "$y-$m-$d.json"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016, writing one JSON
# file per date inside the year's directory.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
......@@ -40,7 +40,7 @@ class QuotesSpider(scrapy.Spider):
self.data_list = []
self.baseURL = 'http://jornadabc.mx'
# section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura']
section_list = ['baja-california', 'chihuahua', 'mexico', 'mundo', 'cultura', 'espectaculos', 'deportes']
for section in section_list:
self.section = section
......
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016; one JSON file per
# date inside the year's directory, deleting files that came back empty.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
      # Keep only files that actually contain scraped items.
      [ -s "$y-$m-$d.json" ] || rm "$y-$m-$d.json"
    done
  done
  cd ..
done
\ No newline at end of file
import scrapy
import json
from datetime import datetime, date, timedelta
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return *text* with every ``<...>`` tag sequence stripped out."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    title = scrapy.Field()     # headline
    text = scrapy.Field()      # article body, HTML tags stripped
    date = scrapy.Field()      # publication date
    location = scrapy.Field()  # dateline / place (not filled by this spider)
    author = scrapy.Field()    # byline (not filled by this spider)
    topic = scrapy.Field()     # section / category
    url = scrapy.Field()       # article URL
class QuotesSpider(scrapy.Spider):
    """Spider for https://www.lajornadamaya.mx.

    Fetches every article published on the date passed with
    ``-a year=YYYY -a month=MM -a day=DD``.

    NOTE(review): this source was recovered from a dump with the leading
    indentation stripped; the nesting below is reconstructed from the control
    flow -- confirm against the original repository.
    """
    name = "noticias"

    def start_requests(self):
        # self.found = False
        # self.flag = False
        # Target date, taken from the -a command-line arguments.
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.req_date = date(int(self.year), int(self.month), int(self.day))
        self.date_format = "%Y-%m-%d"
        self.baseURL = 'https://www.lajornadamaya.mx'
        section_list = ['yucatan', 'quintana-roo', 'campeche', 'deportes', 'nacional',
        'internacional', 'opinion']
        # section_list = ['deportes']
        for section in section_list:
            self.section = section
            for count in range(0,2):
                if ( count == 0 ):
                    # First pass: scan the section's landing page directly.
                    yield scrapy.Request(url=self.baseURL+'/'+section, callback=self.parse_2)
                elif (count == 1):
                    # Second pass: probe the section's JSON pagination; the
                    # parse callback records where the target date sits via
                    # the shared self.page / self.found / self.flag state.
                    # self.section = section
                    self.page = 0
                    self.flag = False
                    self.found = False
                    page = -1
                    if not ( section == 'opinion' ):
                        while True:
                            if ( self.flag ):
                                self.flag = False
                                break
                            page+=1
                            yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(page), callback=self.parse)
                        # NOTE(review): assumed to follow the probe loop above;
                        # re-requests a small window of pages around the one
                        # where the target date was located.
                        if ( self.found ):
                            self.found = False
                            self.page -= 1
                            if ( self.page > 0 ):
                                self.page -= 1
                            for pag in range(self.page, self.page+6):
                                yield scrapy.Request(url=self.baseURL+'/'+section+'?p='+str(pag), callback=self.parse_page, dont_filter=True)
                    else:
                        # The opinion section exposes a single JSON feed.
                        yield scrapy.Request(url=self.baseURL+'/notas?opinion', callback=self.parse_page)

    def parse_2(self, response):  # for the articles on the section front page
        # Article links appear under both h1 and h2 title headers.
        path_list = ['//h1[@class="title"]/a/@href', '//h2[@class="title"]/a/@href']
        link_list = []
        for path in path_list:
            link_list += response.xpath(path).extract()
        for link in link_list:
            # Links begin with their YYYY-MM-DD publication date.
            if ( link[:link.rfind('/')] == self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) ):
                item = NoticiasItem()
                item['date'] = link[:link.rfind('/')]
                item['topic'] = response.url[response.url.rfind('/')+1:].title()
                # yield scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
                request = scrapy.Request(url=self.baseURL+'/'+link, callback=self.parse_item_2)
                request.meta['item'] = item
                yield request

    def parse(self, response):  # for the JSON pagination probe
        json_response = json.loads(response.text)
        if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
            json_list = json_response
        else:
            json_list = json_response['articles']
        for line in json_list:
            # publishDate is "YYYY-MM-DD HH:MM:SS"; keep only the date part.
            this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
            this_date = this_date.date()
            if ( this_date == self.req_date ):
                # Remember which pagination page holds the requested date.
                self.page = int(response.url[response.url.rfind('=')+1:])
                self.found = True
                self.flag = True
                break
            elif ( this_date < self.req_date ):
                # Presumably newest-first ordering: past the date means stop.
                self.flag = True
                break

    def parse_item_2(self, response):  # article pages reached from parse_2
        item = response.meta['item']
        # item = NoticiasItem()
        text = ''
        # item['date'] = response.url[:response.url.rfind('/')][response.url[:response.url.rfind('/')].rfind('/')+1:]
        # item['topic'] = self.section.title()
        item['title'] = response.xpath('//article/h1/text()').extract_first()
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        print item['title']
        yield item

    def parse_page(self, response):  # JSON listings around the located page
        json_response = json.loads(response.text)
        if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
            # Section name is embedded in the URL before the ?p= suffix.
            topic = response.url[response.url.rfind('/')+1:response.url.rfind('=')-2].title()
            json_list = json_response
        else:
            json_list = json_response['articles']
            topic = 'Opinion'
        for line in json_list:
            this_date = datetime.strptime(line['publishDate'][:line['publishDate'].rfind(' ')], self.date_format)
            this_date = this_date.date()
            if ( this_date == self.req_date ):
                item = NoticiasItem()
                item['date'] = line['publishDate']
                item['topic'] = topic
                item['title'] = line['name']
                if not ( response.url[response.url.rfind('/')+1:] == 'notas?opinion' ):
                    request = scrapy.Request(url=self.baseURL+line['url'], callback=self.parse_item)
                else:
                    # Opinion pieces need the URL rebuilt from date + slug.
                    request = scrapy.Request(url=self.baseURL+'/'+line['publishDate'][:line['publishDate'].rfind(' ')]+'/'+line['uriComponent'], callback=self.parse_item)
                request.meta['item'] = item
                yield request

    def parse_item(self, response):  # article pages reached from parse_page
        item = response.meta['item']
        text = ''
        for paragraph in response.xpath('//*[@class="txt"]').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        print item['title']
        yield item
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016, writing one JSON
# file per date inside the year's directory.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016, writing one JSON
# file per date inside the year's directory.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016, writing one JSON
# file per date inside the year's directory.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016; one JSON file per
# date inside the year's directory, deleting files that came back empty.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
      # Keep only files that actually contain scraped items.
      [ -s "$y-$m-$d.json" ] || rm "$y-$m-$d.json"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider day by day through 2017. The 30-day months stop
# at day 17 (the crawl's cutoff date) rather than at month end.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=17 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider day by day for February-April 2017 only, with
# partial day ranges (Feb starts on the 6th, April stops on the 20th).
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 2 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) first=1; last=31 ;;
      04|06|09|11)          first=1; last=20 ;;
      02)                   first=6; last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" "$first" "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
......@@ -24,13 +24,13 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
DOWNLOAD_DELAY=2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
#COOKIES_ENABLED=False
COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False
......
......@@ -5,48 +5,138 @@ import scrapy
import re
from datetime import datetime, date, timedelta
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
return TAG_RE.sub('', text)
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
# class QuotesSpider(scrapy.Spider):
# name = "noticias"
# def start_requests(self):
# year = getattr(self, 'year', None)
# month = getattr(self, 'month', None)
# day = getattr(self, 'day', None)
# self.baseURL='http://yucatan.com.mx/'+year+'/'+month+'/'+day
# urls = [
# self.baseURL,
# ]
# for url in urls:
# yield scrapy.Request(url=url, callback=self.parse)
# def parse(self, response):
# pagination = response.css('div.pagination').css('a::attr(href)').extract()
# if ( len(pagination) > 0 ):
# pagination = pagination[-1]
# pages = int(pagination[pagination.rfind('/')+1:])
# for page in range(0,pages):
# if ( page == 0 ):
# yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
# else:
# yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
# else:
# yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
# def parse_page(self, response):
# for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
# yield scrapy.Request(url=link, callback=self.parse_item)
# def parse_item(self, response):
# text = ''
# item = NoticiasItem()
# item['title'] = response.css('h1.entry-title::text').extract_first()
# item['date'] = response.css('div.base-box').css('span.entry-date::attr(datetime)').extract_first()
# for paragraph in response.css('div.entry-content').css('p').extract():
# text += remove_tags(paragraph) + '\n'
# item['text'] = text
# item['topic'] = response.xpath('//span[@itemprop="title"]/text()').extract()
# item['url'] = response.url
# # print item['title']
# yield item
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
section_list = ['merida', 'yucatan', 'mexico', 'internacional', 'deportes',
'espectaculos', 'imagen', 'economia', 'tecnologia', 'salud']
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.baseURL='http://yucatan.com.mx/'+year+'/'+month+'/'+day
self.baseURL='http://yucatan.com.mx/seccion/'
self.date = date(int(year), int(month), int(day))
self.parsing_month = { 'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12 }
self.stop = False
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
]
for s in section_list:
yield scrapy.Request(url=self.baseURL+s, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').css('a::attr(href)').extract()
if ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) > 0 ):
for link in response.xpath('//a[@class="show-more-link"]/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_pagination)
elif ( len(response.xpath('//a[@class="show-more-link"]/@href').extract()) == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_pagination, dont_filter=True)
def parse_pagination(self, response):
pagination = response.xpath('//*[@class="pagination"]/a/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1]
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
p = 1
while p <= pages:
if ( self.stop ):
p = pages+1
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
if ( p == 1 ):
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
elif ( p > 1 ):
yield scrapy.Request(url=response.url+'/page/'+str(p+1), callback=self.parse_link)
p += 1
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
yield scrapy.Request(url=response.url, callback=self.parse_link, dont_filter=True)
def parse_link(self, response):
for entry in response.xpath('//*[@class="bp-entry"]'):
entry_date = entry.xpath('./*[@class="bp-head"]/div/span/text()').extract_first()
entry_date = entry_date[entry_date.rfind(',')+2:][:entry_date[entry_date.rfind(',')+2:].rfind('-')-2]
news_date = date(int(entry_date[-4:]), self.parsing_month[entry_date[:-8][entry_date[:-8].rfind(' ')+1:]], int(entry_date[:1]))
link = entry.xpath('./*[@class="bp-head"]/h2/a/@href').extract_first()
if news_date == self.date and link is not None:
yield scrapy.Request(url=link, callback=self.parse_item)
elif news_date < self.date:
self.stop = True
def parse_page(self, response):
for link in response.css('div.bp-head').css('h2').css('a::attr(href)').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
text = ''
......@@ -56,7 +146,7 @@ class QuotesSpider(scrapy.Spider):
for paragraph in response.css('div.entry-content').css('p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['topic'] = response.xpath('//span[@itemprop="title"]/text()').extract()
item['topic'] = [response.xpath('//*[@class="breadcrumbs-plus"]/span/a/span/text()').extract()[1]]
item['url'] = response.url
# print item['title']
yield item
......
#!/bin/bash
# Crawl diarioYucatan day by day and archive every non-empty JSON result
# under $HOME/crawler/prueba/$section/$site/$year/.
#
# Improvement: the crawl / empty-file cleanup / move-to-archive sequence was
# duplicated verbatim in all three month branches; it is now a single
# function, so the archiving logic can only drift in one place.
section=sitios_yucatan
site=diarioYucatan

# crawl_day YEAR MONTH DAY
# Runs the spider for one date, deletes the output if it contains no items,
# and otherwise moves it into the per-year archive directory.
crawl_day() {
  local y=$1 m=$2 d=$3
  local out="$y-$m-$d.json"
  scrapy crawl noticias -t json -o "$out" -a year="$y" -a month="$m" -a day="$d"
  # Remove the file when the spider produced no items.
  [ -s "$out" ] || rm "$out"
  if [ -e "$out" ]; then
    # Destination for the generated JSON files; created on first use.
    local destination="$HOME/crawler/prueba/$section/$site/$y/"
    [ -d "$destination" ] || mkdir -p "$destination"
    mv -f "$out" "$destination"
  fi
}

for y in $(seq 2012 2012); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 7 7); do
    # Day range to request for this month (only July 31st for 31-day months).
    case "$m" in
      01|03|05|07|08|10|12) first=31; last=31 ;;
      04|06|09|11)          first=1;  last=17 ;;
      02)                   first=1;  last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" "$first" "$last"); do
      crawl_day "$y" "$m" "$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for April 2017 only (the month loop covers just
# month 04), with partial day ranges left over from an interrupted crawl.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 4 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) first=7; last=31 ;;
      04|06|09|11)          first=1; last=17 ;;
      02)                   first=1; last=6  ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" "$first" "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
......@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......
import scrapy
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Strip every ``<...>`` tag sequence from *text* and return the rest."""
    return ''.join(TAG_RE.split(text))
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    title = scrapy.Field()     # headline
    text = scrapy.Field()      # article body, HTML tags stripped
    date = scrapy.Field()      # publication timestamp
    location = scrapy.Field()  # dateline / place (not filled by this spider)
    author = scrapy.Field()    # byline
    topic = scrapy.Field()     # section / category
    url = scrapy.Field()       # article URL
class QuotesSpider(scrapy.Spider):
    """Spider for laverdadnoticias.com: collects the articles listed under
    the /YYYY/MM/DD daily archive for the requested date.

    NOTE(review): this source was recovered from a dump with the leading
    indentation stripped; the nesting below is reconstructed from the
    control flow -- confirm against the original repository.
    """
    name = "noticias"

    def start_requests(self):
        # Target date, taken from the -a command-line arguments.
        year = getattr(self, 'year', None)
        month = getattr(self, 'month', None)
        day = getattr(self, 'day', None)
        self.baseURL='http://laverdadnoticias.com/'+year+'/'+month+'/'+day
        urls = [
        self.baseURL,
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Walk every page of the archive's pagination, if any.
        pagination = response.css("div.pagination_div").css("a::attr(href)").extract()
        if ( len(pagination) > 0 ):
            # Second-to-last pagination link targets the last page; its final
            # path segment is the total page count.
            pagination = pagination[-2].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(0,pages):
                if ( page == 0 ):
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        # NOTE(review): one item instance is reused across all posts on the
        # page; topic/author could leak between requests -- confirm intended.
        item = NoticiasItem()
        for post in response.css('div.col-md-12').css('div.cp-post-content'):
            item['topic'] = post.css('ul.cp-post-tools').css('li').css('a::attr(title)').extract()
            item['author'] = post.css('ul.cp-post-tools').xpath('./li[2]/text()').extract_first()
            request = scrapy.Request(url=post.xpath('./h3/a/@href').extract_first(), callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        item = response.meta['item']
        text = ''
        item['title'] = response.css("div.cp-post-content").css("h1").css("a::text").extract_first()
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        for paragraph in response.css("div.cp-post-content").css("p").extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        # print item['title']
        yield item
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for April 2017 only (month loop covers just 04);
# the 30-day branch stops at day 17.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 4 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=17 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day of 2016, writing one JSON
# file per date inside the year's directory.
for y in $(seq 2016 2016); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 12); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=29 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for January-April 2017, one JSON file per date,
# deleting files that came back empty.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
      # Keep only files that actually contain scraped items.
      [ -s "$y-$m-$d.json" ] || rm "$y-$m-$d.json"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for January-March 2017; the 30-day branch stops
# at day 17.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 3); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=17 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for January-April 2017; the 30-day branch stops
# at day 17 (crawl cutoff).
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=17 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for August 2015 only (month loop covers just 08),
# days 2-22, deleting result files that came back empty.
for y in $(seq 2015 2015); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 8 8); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) first=2; last=22 ;;
      04|06|09|11)          first=1; last=17 ;;
      02)                   first=1; last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" "$first" "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
      # Keep only files that actually contain scraped items.
      [ -s "$y-$m-$d.json" ] || rm "$y-$m-$d.json"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for January-April 2017, one JSON file per date,
# deleting files that came back empty.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=30 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
      # Keep only files that actually contain scraped items.
      [ -s "$y-$m-$d.json" ] || rm "$y-$m-$d.json"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for January-April 2017; the 30-day branch stops
# at day 17 (crawl cutoff).
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=17 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
......@@ -24,13 +24,13 @@ NEWSPIDER_MODULE = 'yucatanAlMinuto.spiders'
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY=3
DOWNLOAD_DELAY=2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN=16
#CONCURRENT_REQUESTS_PER_IP=16
# Disable cookies (enabled by default)
#COOKIES_ENABLED=False
COOKIES_ENABLED=False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED=False
......
import scrapy
from datetime import datetime, date
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22
......@@ -9,13 +10,14 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
title = scrapy.Field()
text = scrapy.Field()
date = scrapy.Field()
location = scrapy.Field()
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -24,17 +26,20 @@ class QuotesSpider(scrapy.Spider):
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.baseURL='http://www.yucatanalminuto.com'
self.date = date(int(self.year), int(self.month), int(self.day))
urls = [
self.baseURL,
]
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
for menu in response.css("center").css("td").css("a::attr(href)").extract():
if ( menu != '/' ):
yield scrapy.Request(url=response.url+menu, callback=self.parse_pagination)
def parse_pagination(self, response):
pagination = response.xpath('//div[@id="color_seccion"]/div[1]/p/text()').extract()
if ( len(pagination) > 0 ):
......@@ -48,17 +53,22 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
item = NoticiasItem()
# for post in response.xpath('//a[@class="lista_noticia"]'):
for post in response.xpath('//a[@class="lista_noticia"]'):
date = post.css('div.pad5').css('p.sec_autor2::text').extract_first()
if ( self.year+'-'+self.month+'-'+self.day == date[21:31] ):
item['date'] = date[21:]
news_date = datetime.strptime(date[21:31], '%Y-%m-%d').date()
# if ( self.year+'-'+self.month+'-'+self.day == date[21:31] ):
if ( news_date == self.date ):
item['date'] = news_date
item['topic'] = response.xpath('//div[@id="color_seccion"]/h1/text()').extract_first()
request = scrapy.Request(url=self.baseURL+post.css('::attr(href)').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
text = ''
item = response.meta['item']
......
#!/bin/bash
# Run the "noticias" spider for January-April 2017; the 30-day branch stops
# at day 17 (crawl cutoff).
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 1 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) last=31 ;;
      04|06|09|11)          last=17 ;;
      02)                   last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" 1 "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider for April 2017 only (month loop covers just 04);
# the 30-day branch resumes at day 18, completing an earlier partial crawl.
for y in $(seq 2017 2017); do
  [ -d "$y" ] || mkdir -p "$y"
  cd "$y"
  for m in $(seq -f "%02g" 4 4); do
    # Day range to request for this month.
    case "$m" in
      01|03|05|07|08|10|12) first=1;  last=31 ;;
      04|06|09|11)          first=18; last=30 ;;
      02)                   first=1;  last=28 ;;
      *)                    continue ;;
    esac
    for d in $(seq -f "%02g" "$first" "$last"); do
      scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
    done
  done
  cd ..
done
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment