Commit 2d809dad authored by Renán Sosa Guillen

crawlers

parent 11c4aa01
......@@ -6,16 +6,18 @@ import re
# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matching ``TAG_RE`` removed.

    Args:
        text: String that may contain markup tags such as ``<p>`` or ``</a>``.

    Returns:
        A copy of ``text`` in which each tag match has been replaced by the
        empty string. Text between tags is left untouched.
    """
    return TAG_RE.sub('', text)
class NoticiasItem(scrapy.Item):
    """Item populated by the spider's ``parse_item`` callback.

    Each attribute is a ``scrapy.Field`` and is declared exactly once.
    """

    # Set in parse_item from the text of the h1 with class "entry-title".
    title = scrapy.Field()
    text = scrapy.Field()
    # Set in parse_item from an attribute of a <time> element.
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    # Set in parse_item from the link text inside the "entry-cat" element.
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -26,10 +28,11 @@ class QuotesSpider(scrapy.Spider):
self.baseURL='http://ljz.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL,
]
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//div[@class="pagination"]/a/@href').extract()
if ( len(pagination) > 0 ):
......@@ -43,14 +46,16 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Request every link found under an ``h2`` with class ``cat-list-title``.

    Args:
        response: Response object queried with XPath for the link ``href``
            values.

    Yields:
        scrapy.Request: One request per extracted ``href``, with
        ``self.parse_item`` as its callback.
    """
    for link in response.xpath('//h2[@class="cat-list-title"]/a/@href').extract():
        yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = response.xpath('//time[@class="entry-date updated"]/@datetime').extract_first()
item['date'] = response.xpath('//*[@class="entry-post-meta"]').css('time::attr(content)').extract_first()
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="entry-cat"]/a/text()').extract_first()
......
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment