crawlers

d895ebe3 · Renán Sosa Guillen · 3885bd5c · d895ebe3 · d895ebe3 · d895ebe3
Commit d895ebe3 authored Dec 08, 2017 by Renán Sosa Guillen
3 changed files
--- a/descarga_por_dia/alChile/alChile/spiders/noticias.py
+++ b/descarga_por_dia/alChile/alChile/spiders/noticias.py
 import scrapy, re

-
-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-
+"""
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):

--- a/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.py
+++ b/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.py
 import scrapy, re
 from datetime import datetime, timedelta, tzinfo

-## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""
+scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
@@ -46,12 +48,12 @@ class QuotesSpider(scrapy.Spider):
    def parse(self, response):
        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()

-		if ( len(pagination) > 0 ):
+        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])

            for page in range(0, pages):
-				if ( page == 0 ):
+                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

                else:

--- a/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.pyc
+++ b/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.pyc