Commit e9188630 authored by Mario Chirinos

ljz date

parent 39b868fa
......@@ -18,12 +18,12 @@ class NoticiasSpider(scrapy.Spider):
start_urls = ['http://ljz.mx/']
#-----------------------------------------------------------------------
def start_requests(self):
year = getattr(self, "year", None)
month = getattr(self, "month", None)
day = getattr(self, "day", None)
self.year = getattr(self, "year", None)
self.month = getattr(self, "month", None)
self.day = getattr(self, "day", None)
self.date = year + "-" + month.zfill(2) + "-" + self.day.zfill(2)
self.baseURL = "https://ljz.mx/" + self.day.zfill(2) + "/" + month.zfill(2) + "/" + year + "/"
self.date = self.year + "-" + self.month.zfill(2) + "-" + self.day.zfill(2)
self.baseURL = "https://ljz.mx/" + self.day.zfill(2) + "/" + self.month.zfill(2) + "/" + self.year + "/"
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
......@@ -40,13 +40,13 @@ class NoticiasSpider(scrapy.Spider):
def parse_item(self, response):
# print(response.url)
item = LajornadazacItem()
item["date"] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item["date"] = self.year+"-"+self.month+"-"+self.day
item["title"] = response.xpath('//meta[@itemprop="headline"]/@content').extract_first()
item["topic"] = response.xpath('//meta[@property="article:section"]/@content').extract_first()
item["author"] = response.xpath('//p[@class="ljz_coauthors"]/i//text()').extract_first()
text=""
for p in response.xpath('//p[@class="ljz_coauthors"]/following-sibling::div//p').extract():
text += remove_tags(p) + "\n"
text += remove_tags(p) + "\n "
item["text"]=text
item["url"]=response.url
print(self.allowed_domains,item["title"])
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment