Commit d895ebe3 authored by Renán Sosa Guillen

crawlers

parent 3885bd5c
import scrapy, re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......
import scrapy, re
from datetime import datetime, timedelta, tzinfo
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
"""
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
......@@ -46,12 +48,12 @@ class QuotesSpider(scrapy.Spider):
def parse(self, response):
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
if ( len(pagination) > 0 ):
if len(pagination) > 0:
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
if page == 0:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment