Commit 604d6176 authored by Renán Sosa Guillen

Se corrigió lectura de primera página en spider/noticias.py del diarioYucatan

parent 5d1ef2b3
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import scrapy import scrapy
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 month=12 day=24 #scrapy crawl noticias -t json --nolog -o noticias.json -a year=2016 -a month=12 -a day=24
import re import re
...@@ -35,7 +35,10 @@ class QuotesSpider(scrapy.Spider): ...@@ -35,7 +35,10 @@ class QuotesSpider(scrapy.Spider):
pages = response.css("div.pagination").css("a::attr(href)")[-1].extract() pages = response.css("div.pagination").css("a::attr(href)")[-1].extract()
pages = int(pages[pages.rfind('/')+1:]) pages = int(pages[pages.rfind('/')+1:])
for p in range(0,pages): for p in range(0,pages):
yield scrapy.Request(url=response.url+"/page/"+str(p+1), callback=self.parse_page) if ( p == 0 ):
yield scrapy.Request(url=response.url+"/page/"+str(p+1), callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=response.url+"/page/"+str(p+1), callback=self.parse_page)
def parse_page(self, response): def parse_page(self, response):
for link in response.css("div.bp-head").css("h2").css("a::attr(href)").extract(): for link in response.css("div.bp-head").css("h2").css("a::attr(href)").extract():
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment