"""Scrapy spider that downloads the news published on florcastillo.mx for one day.

Usage:
    scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22

The crawler downloads news up to 2017.10.17.
"""
import re

import scrapy


TAG_RE = re.compile(r'<[^>]+>')

# Trailing UTC designator of an ISO-8601 timestamp: "Z", "+HH:MM" or "+HHMM".
TZ_SUFFIX_RE = re.compile(r'(?:Z|[+-]\d{2}:?\d{2})$')

# '-06:00' corresponds to UTC-6, the time zone of Yucatan (central Mexico).
LOCAL_UTC_OFFSET = '-06:00'


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return TAG_RE.sub('', text)


def _with_local_offset(timestamp):
    """Return ``timestamp`` with its UTC offset replaced by ``LOCAL_UTC_OFFSET``.

    The date and time digits are kept as published; only the trailing offset
    (if any) is swapped. A timestamp without an offset simply gets the local
    offset appended, so no time digits are ever cut off.
    """
    match = TZ_SUFFIX_RE.search(timestamp)
    if match:
        timestamp = timestamp[:match.start()]
    return timestamp + LOCAL_UTC_OFFSET


class NoticiasItem(scrapy.Item):
    """Fields collected for a single news article."""

    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()


class QuotesSpider(scrapy.Spider):
    """Spider that crawls the daily news archive selected by year/month/day.

    The spider arguments ``year``, ``month`` and ``day`` (``-a name=value`` on
    the command line) are all required.
    """

    name = "noticias"

    def start_requests(self):
        """Build the daily archive URL and request its first page.

        Raises:
            ValueError: if any of ``year``, ``month`` or ``day`` was not given.
        """
        date_parts = [getattr(self, part, None) for part in ('year', 'month', 'day')]
        if any(part is None for part in date_parts):
            raise ValueError(
                "the spider arguments 'year', 'month' and 'day' are all required "
                "(e.g. -a year=2017 -a month=3 -a day=22)"
            )

        # str() lets the spider also be built programmatically with integers.
        self.baseURL = 'http://florcastillo.mx/noticias/' + '/'.join(
            str(part) for part in date_parts
        )

        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Schedule one ``parse_page`` request per listing page of the day.

        The page count is read from the last link inside ``div.pagination``;
        when there is no pagination, or the count cannot be read, only the
        current page is processed.
        """
        pagination = response.css('div.pagination').css('a::attr(href)').extract()

        pages = 1
        if pagination:
            # Tolerate a trailing slash ('.../page/3/') before taking the last segment.
            last_link = pagination[-1].rstrip('/')
            try:
                pages = int(last_link[last_link.rfind('/') + 1:])
            except ValueError:
                pages = 1

        # This response was already fetched for this callback, so the duplicate
        # filter must be bypassed to hand it to parse_page.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

        for page in range(2, pages + 1):
            yield scrapy.Request(url=self.baseURL + "/page/" + str(page), callback=self.parse_page)

    def parse_page(self, response):
        """Request every article linked from a listing page."""
        for link in response.css('div.list-block').xpath('./h3/a/@href').extract():
            # urljoin leaves absolute URLs untouched and resolves relative ones.
            yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)

    def parse_item(self, response):
        """Extract a ``NoticiasItem`` from an article page.

        ``date`` and ``topic`` are set to ``None`` when the page does not
        provide them, instead of aborting the whole item.
        """
        item = NoticiasItem()

        item['title'] = response.css('div.post-title').css('h1.entry-title::text').extract_first()

        published = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['date'] = _with_local_offset(published) if published is not None else None

        topic = response.xpath('//ul[@id="tpcrn-breadcrumbs"]/li[2]/a/text()').extract_first()
        # The last two characters of the breadcrumb text are dropped, as before.
        item['topic'] = topic[:-2] if topic is not None else None

        item['text'] = ''.join(
            remove_tags(paragraph) + '\n'
            for paragraph in response.css('div.post_content').css('p').extract()
        )
        item['url'] = response.url

        yield item