Commit e9188630 authored by Mario Chirinos

ljz date

parent 39b868fa
...@@ -18,12 +18,12 @@ class NoticiasSpider(scrapy.Spider): ...@@ -18,12 +18,12 @@ class NoticiasSpider(scrapy.Spider):
start_urls = ['http://ljz.mx/'] start_urls = ['http://ljz.mx/']
#----------------------------------------------------------------------- #-----------------------------------------------------------------------
def start_requests(self): def start_requests(self):
year = getattr(self, "year", None) self.year = getattr(self, "year", None)
month = getattr(self, "month", None) self.month = getattr(self, "month", None)
day = getattr(self, "day", None) self.day = getattr(self, "day", None)
self.date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2) self.date = self.year + "-" + self.month.zfill(2) + "-" + self.day.zfill(2)
self.baseURL = "https://ljz.mx/" + self.day.zfill(2) + "/" + month.zfill(2) + "/" + year + "/" self.baseURL = "https://ljz.mx/" + self.day.zfill(2) + "/" + self.month.zfill(2) + "/" + self.year + "/"
yield scrapy.Request(url=self.baseURL, callback=self.parse) yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response): def parse(self, response):
...@@ -40,13 +40,13 @@ class NoticiasSpider(scrapy.Spider): ...@@ -40,13 +40,13 @@ class NoticiasSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
# print(response.url) # print(response.url)
item = LajornadazacItem() item = LajornadazacItem()
item["date"] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item["date"] = self.year+"-"+self.month+"-"+self.day
item["title"] = response.xpath('//meta[@itemprop="headline"]/@content').extract_first() item["title"] = response.xpath('//meta[@itemprop="headline"]/@content').extract_first()
item["topic"] = response.xpath('//meta[@property="article:section"]/@content').extract_first() item["topic"] = response.xpath('//meta[@property="article:section"]/@content').extract_first()
item["author"] = response.xpath('//p[@class="ljz_coauthors"]/i//text()').extract_first() item["author"] = response.xpath('//p[@class="ljz_coauthors"]/i//text()').extract_first()
text="" text=""
for p in response.xpath('//p[@class="ljz_coauthors"]/following-sibling::div//p').extract(): for p in response.xpath('//p[@class="ljz_coauthors"]/following-sibling::div//p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n "
item["text"]=text item["text"]=text
item["url"]=response.url item["url"]=response.url
print(self.allowed_domains,item["title"]) print(self.allowed_domains,item["title"])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment