Commit 2f7d1d20 authored by Renán Sosa Guillen

Crawlers.

parent 73a2dbe5
...@@ -43,7 +43,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -43,7 +43,7 @@ class QuotesSpider(scrapy.Spider):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=response.url+'/page/'+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
......
...@@ -36,7 +36,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -36,7 +36,7 @@ class QuotesSpider(scrapy.Spider):
'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes'] 'capital', 'sociedad', 'cultura', 'espectaculos', 'deportes']
for section in section_list: for section in section_list:
# para la fecha 2009.02.15 o menores, se tiene una estructura determinada de página # para la fecha 2009/02/15 o menores, se tiene una estructura determinada de pagina
# para las fechas mayores a esa la estructura cambia # para las fechas mayores a esa la estructura cambia
if ( requested_date <= comparison_date ): if ( requested_date <= comparison_date ):
yield scrapy.Request(url=self.baseURL+section, callback=self.parse) yield scrapy.Request(url=self.baseURL+section, callback=self.parse)
...@@ -47,7 +47,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -47,7 +47,7 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
if ( response.url[:response.url.rfind('/')+1] == self.baseURL ): # verifica que se conserva la misma URL base if ( response.url[:response.url.rfind('/')+1] == self.baseURL ): # verifica que se conserva la misma URL base
section = response.url[response.url.rfind('/')+1:] section = response.url[response.url.rfind('/')+1:]
if ( section == 'opinion' ): # la sección 'opinión' tiene una estructura diferente a las demás if ( section == 'opinion' ): # la seccion 'opinion' tiene una estructura diferente a las otras
path_list = ['//*[@id="columnas"]/p/a/@href', path_list = ['//*[@id="columnas"]/p/a/@href',
'//*[@id="opinion"]/p/a/@href'] '//*[@id="opinion"]/p/a/@href']
else: else:
......
#!/bin/bash
# Run the "noticias" spider once per calendar day and store each day's items
# in <year>/<year>-<month>-<day>.json, relative to the directory the script
# is started from.

# Print the number of days in month $2 (1-12, may be zero-padded) of year $1.
days_in_month() {
    local year=$1
    local month=$((10#$2))   # force base 10 so "08"/"09" are not read as octal
    case $month in
        4|6|9|11) echo 30 ;;
        2)
            # Gregorian leap-year rule, so February follows the crawled year.
            if (( (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 )); then
                echo 29
            else
                echo 28
            fi
            ;;
        *) echo 31 ;;
    esac
}

# Year range to crawl (inclusive); change both bounds to cover other years.
for y in $(seq 2016 2016); do
    mkdir -p "$y"
    # Stop instead of writing the output files into the wrong directory.
    cd "$y" || exit 1
    for m in $(seq -f "%02g" 1 12); do
        for d in $(seq -f "%02g" 1 "$(days_in_month "$y" "$m")"); do
            scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
        done
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash #!/bin/bash
for y in `seq 2017 2017`; for y in `seq 2016 2016`;
do do
if [ ! -d $y ]; then if [ ! -d $y ]; then
mkdir -p $y; mkdir -p $y;
fi fi
cd $y cd $y
for m in $(seq -f "%02g" 3 4) for m in $(seq -f "%02g" 1 12)
do do
if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
for d in $(seq -f "%02g" 1 31) for d in $(seq -f "%02g" 1 31)
...@@ -22,7 +22,7 @@ do ...@@ -22,7 +22,7 @@ do
done done
fi fi
if [ $m -eq 2 ]; then if [ $m -eq 2 ]; then
for d in $(seq -f "%02g" 1 28) for d in $(seq -f "%02g" 1 29)
do do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
done done
......
#!/bin/bash #!/bin/bash
for y in `seq 2017 2017`; for y in `seq 2016 2016`;
do do
if [ ! -d $y ]; then if [ ! -d $y ]; then
mkdir -p $y; mkdir -p $y;
fi fi
cd $y cd $y
for m in $(seq -f "%02g" 1 4) for m in $(seq -f "%02g" 1 12)
do do
if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
for d in $(seq -f "%02g" 1 31) for d in $(seq -f "%02g" 1 31)
...@@ -22,7 +22,7 @@ do ...@@ -22,7 +22,7 @@ do
done done
fi fi
if [ $m -eq 2 ]; then if [ $m -eq 2 ]; then
for d in $(seq -f "%02g" 1 28) for d in $(seq -f "%02g" 1 29)
do do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
done done
......
#!/bin/bash
# Run the "noticias" spider once per calendar day and store each day's items
# in <year>/<year>-<month>-<day>.json, relative to the directory the script
# is started from.

# Print the number of days in month $2 (1-12, may be zero-padded) of year $1.
days_in_month() {
    local year=$1
    local month=$((10#$2))   # force base 10 so "08"/"09" are not read as octal
    case $month in
        4|6|9|11) echo 30 ;;
        2)
            # Gregorian leap-year rule, so February follows the crawled year.
            if (( (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 )); then
                echo 29
            else
                echo 28
            fi
            ;;
        *) echo 31 ;;
    esac
}

# Year range to crawl (inclusive); change both bounds to cover other years.
for y in $(seq 2016 2016); do
    mkdir -p "$y"
    # Stop instead of writing the output files into the wrong directory.
    cd "$y" || exit 1
    for m in $(seq -f "%02g" 1 12); do
        for d in $(seq -f "%02g" 1 "$(days_in_month "$y" "$m")"); do
            scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
        done
    done
    cd ..
done
\ No newline at end of file
...@@ -43,7 +43,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -43,7 +43,7 @@ class QuotesSpider(scrapy.Spider):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=response.url+'/page/'+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
......
#!/bin/bash
# Run the "noticias" spider once per calendar day and store each day's items
# in <year>/<year>-<month>-<day>.json, relative to the directory the script
# is started from.

# Print the number of days in month $2 (1-12, may be zero-padded) of year $1.
days_in_month() {
    local year=$1
    local month=$((10#$2))   # force base 10 so "08"/"09" are not read as octal
    case $month in
        4|6|9|11) echo 30 ;;
        2)
            # Gregorian leap-year rule, so February follows the crawled year.
            if (( (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 )); then
                echo 29
            else
                echo 28
            fi
            ;;
        *) echo 31 ;;
    esac
}

# Year range to crawl (inclusive); change both bounds to cover other years.
for y in $(seq 2016 2016); do
    mkdir -p "$y"
    # Stop instead of writing the output files into the wrong directory.
    cd "$y" || exit 1
    for m in $(seq -f "%02g" 1 12); do
        for d in $(seq -f "%02g" 1 "$(days_in_month "$y" "$m")"); do
            scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
        done
    done
    cd ..
done
\ No newline at end of file
#!/bin/bash
# Run the "noticias" spider once per calendar day and store each day's items
# in <year>/<year>-<month>-<day>.json, relative to the directory the script
# is started from.

# Print the number of days in month $2 (1-12, may be zero-padded) of year $1.
days_in_month() {
    local year=$1
    local month=$((10#$2))   # force base 10 so "08"/"09" are not read as octal
    case $month in
        4|6|9|11) echo 30 ;;
        2)
            # Gregorian leap-year rule, so February follows the crawled year.
            if (( (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 )); then
                echo 29
            else
                echo 28
            fi
            ;;
        *) echo 31 ;;
    esac
}

# Year range to crawl (inclusive); change both bounds to cover other years.
for y in $(seq 2016 2016); do
    mkdir -p "$y"
    # Stop instead of writing the output files into the wrong directory.
    cd "$y" || exit 1
    for m in $(seq -f "%02g" 1 12); do
        for d in $(seq -f "%02g" 1 "$(days_in_month "$y" "$m")"); do
            scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
        done
    done
    cd ..
done
\ No newline at end of file
This diff is collapsed.
#!/bin/bash
# Run the "noticias" spider once per calendar day and store each day's items
# in <year>/<year>-<month>-<day>.json, relative to the directory the script
# is started from.

# Print the number of days in month $2 (1-12, may be zero-padded) of year $1.
days_in_month() {
    local year=$1
    local month=$((10#$2))   # force base 10 so "08"/"09" are not read as octal
    case $month in
        4|6|9|11) echo 30 ;;
        2)
            # Gregorian leap-year rule, so February follows the crawled year.
            if (( (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 )); then
                echo 29
            else
                echo 28
            fi
            ;;
        *) echo 31 ;;
    esac
}

# Year range to crawl (inclusive); change both bounds to cover other years.
for y in $(seq 2016 2016); do
    mkdir -p "$y"
    # Stop instead of writing the output files into the wrong directory.
    cd "$y" || exit 1
    for m in $(seq -f "%02g" 1 12); do
        for d in $(seq -f "%02g" 1 "$(days_in_month "$y" "$m")"); do
            scrapy crawl noticias -t json -o "$y-$m-$d.json" -a year="$y" -a month="$m" -a day="$d"
        done
    done
    cd ..
done
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -7,7 +7,7 @@ do ...@@ -7,7 +7,7 @@ do
mkdir -p $y; mkdir -p $y;
fi fi
cd $y cd $y
for m in $(seq -f "%02g" 1 4) for m in $(seq -f "%02g" 4 4)
do do
if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then if [ $m -eq 1 -o $m -eq 3 -o $m -eq 5 -o $m -eq 7 -o $m -eq 8 -o $m -eq 10 -o $m -eq 12 ]; then
for d in $(seq -f "%02g" 1 31) for d in $(seq -f "%02g" 1 31)
...@@ -16,7 +16,7 @@ do ...@@ -16,7 +16,7 @@ do
done done
fi fi
if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then if [ $m -eq 4 -o $m -eq 6 -o $m -eq 9 -o $m -eq 11 ]; then
for d in $(seq -f "%02g" 1 17) for d in $(seq -f "%02g" 18 30)
do do
scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d
done done
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment