Commit 9cf59fb3 authored by Mario Chirinos

modificacion el comentario

parent 673e21e0
...@@ -12,11 +12,11 @@ class NoticiasSpider(scrapy.Spider): ...@@ -12,11 +12,11 @@ class NoticiasSpider(scrapy.Spider):
start_urls = ['http://elcomentario.ucol.mx/'] start_urls = ['http://elcomentario.ucol.mx/']
def start_requests(self): def start_requests(self):
year = getattr(self, "year", None) self.year = getattr(self, "year", None)
month = getattr(self, "month", None) self.month = getattr(self, "month", None)
day = getattr(self, "day", None) self.day = getattr(self, "day", None)
baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2)) baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(self.year, self.month.zfill(2), self.day.zfill(2))
yield scrapy.Request(url=baseURL, callback=self.parse) yield scrapy.Request(url=baseURL, callback=self.parse)
...@@ -24,26 +24,26 @@ class NoticiasSpider(scrapy.Spider): ...@@ -24,26 +24,26 @@ class NoticiasSpider(scrapy.Spider):
def parse(self, response): def parse(self, response):
print(response.url) print(response.url)
for link in response.xpath('//h5[@class="mkd-pt-six-title"]/a/@href').extract(): for link in response.xpath('//h2[@class="thumb-title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url="https://elcomentario.ucol.mx"+link, callback=self.parse_item)
next_page = response.xpath('//li[@class="mkd-pagination-next"]/a/@href').extract_first() next_page = response.xpath('//li[@class="the-next-page"]/a/@href').extract_first()
print("next_page", next_page) print("next_page", next_page)
if next_page is not None: if next_page is not None:
yield scrapy.Request(url=next_page, callback=self.parse) yield scrapy.Request(url=next_page, callback=self.parse)
#----------------------------------------------------------------------- #-----------------------------------------------------------------------
def parse_item(self, response): def parse_item(self, response):
print(response.url) # print(response.url)
item = ElcomentarioItem() item = ElcomentarioItem()
item["date"] = response.xpath("//meta[@property='article:published_time']/@content").extract_first() item["date"] = self.year+"-"+self.month.zfill(2)+"-"+self.day.zfill(2)
item["title"] = response.xpath("//meta[@property='og:title']/@content").extract_first() item["title"] = response.xpath("//div[@class='entry-header']/h1/text()").extract_first()
item["topic"] = "" item["topic"] = response.xpath("//div[@class='entry-header']/span/a/text()").extract_first()
text="" text=""
for p in response.xpath('//div[@class="pf-content"]/p').extract(): for p in response.xpath('//div[@class="entry-content entry clearfix"]/p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n"
item["text"]=text item["text"]=text
item["url"]=response.url item["url"]=response.url
print(item["title"]) print(self.allowed_domains,item["title"])
yield(item) yield(item)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment