Commit 9cf59fb3 authored by Mario Chirinos

modificacion el comentario

parent 673e21e0
......@@ -12,11 +12,11 @@ class NoticiasSpider(scrapy.Spider):
# Site root. start_requests() below builds its dated archive URL itself and
# does not read this list.
start_urls = ['http://elcomentario.ucol.mx/']
def start_requests(self):
    """Yield the request for the daily archive page of the requested date.

    The date is read from the ``year``, ``month`` and ``day`` attributes of
    the spider (e.g. set with ``-a year=2020 -a month=3 -a day=7``). The
    values are stored back on the spider as strings so that other callbacks
    can reuse them.

    Raises:
        ValueError: if any of ``year``, ``month`` or ``day`` is not set.
    """
    parts = {name: getattr(self, name, None) for name in ("year", "month", "day")}

    # Fail early with a readable message instead of crashing on None.zfill()
    # or silently requesting a ".../None/..." URL.
    missing = [name for name, value in parts.items() if value is None]
    if missing:
        raise ValueError(
            "missing spider argument(s): {0}; expected year, month and day".format(
                ", ".join(missing)
            )
        )

    # Coerce to str so integer arguments work with zfill() as well.
    self.year = str(parts["year"])
    self.month = str(parts["month"])
    self.day = str(parts["day"])

    baseURL = "https://elcomentario.ucol.mx/{0}/{1}/{2}/".format(
        self.year, self.month.zfill(2), self.day.zfill(2)
    )
    yield scrapy.Request(url=baseURL, callback=self.parse)
......@@ -24,26 +24,26 @@ class NoticiasSpider(scrapy.Spider):
def parse(self, response):
    """Follow every article link on an archive page, then the next page.

    Yields one request per article (handled by ``parse_item``) and, when a
    "next page" link is present, one request for it handled by this same
    callback.
    """
    self.logger.debug("archive page: %s", response.url)

    for link in response.xpath('//h2[@class="thumb-title"]/a/@href').extract():
        # urljoin() resolves relative hrefs against the page URL and leaves
        # absolute ones untouched, so no host needs to be hard-coded here.
        yield scrapy.Request(url=response.urljoin(link), callback=self.parse_item)

    next_page = response.xpath('//li[@class="the-next-page"]/a/@href').extract_first()
    self.logger.debug("next_page: %s", next_page)
    if next_page is not None:
        yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
#-----------------------------------------------------------------------
def parse_item(self, response):
    """Build an ``ElcomentarioItem`` from a single article page.

    The item's ``date`` comes from the spider's ``year``/``month``/``day``
    attributes (formatted ``YYYY-MM-DD`` with zero-padded month and day),
    not from the page. ``title`` and ``topic`` are read from the
    ``entry-header`` block and are ``None`` when the node is absent.
    ``text`` is the tag-stripped content of every paragraph in the article
    body, one paragraph per line. ``url`` is the response URL.
    """
    item = ElcomentarioItem()
    # str() keeps this working even if the date parts were set as integers.
    item["date"] = "{0}-{1}-{2}".format(
        self.year, str(self.month).zfill(2), str(self.day).zfill(2)
    )
    item["title"] = response.xpath("//div[@class='entry-header']/h1/text()").extract_first()
    item["topic"] = response.xpath("//div[@class='entry-header']/span/a/text()").extract_first()

    paragraphs = response.xpath('//div[@class="entry-content entry clearfix"]/p').extract()
    # Each paragraph is followed by a newline, including the last one.
    item["text"] = "".join(remove_tags(p) + "\n" for p in paragraphs)
    item["url"] = response.url

    self.logger.debug("%s %s", getattr(self, "allowed_domains", None), item["title"])
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment