Commit 3ea10cf1 authored by Mario Chirinos

heraldo leon

parent 43fbb106
......@@ -20,12 +20,12 @@ class NoticiasSpider(scrapy.Spider):
start_urls = ['http://adnoticias.mx/']
#-----------------------------------------------------------------------
def start_requests(self):
    """Build the site's /YYYY/MM/DD/ archive URL from spider arguments and request it.

    Spider arguments arrive from the CLI, e.g.
    ``scrapy crawl noticias -a year=2023 -a month=4 -a day=7``; ``getattr``
    falls back to ``None`` when an argument was not supplied.
    Yields a single request for the day's archive page, parsed by ``parseList``.
    """
    self.year = getattr(self, "year", None)
    self.month = getattr(self, "month", None)
    self.day = getattr(self, "day", None)
    # Zero-pad month/day so the URL matches the site's /YYYY/MM/DD/ scheme.
    # NOTE(review): concatenation raises TypeError if year/month/day are missing —
    # presumably the crawl is always launched with all three; confirm with caller.
    self.date = self.year + "-" + self.month.zfill(2) + "-" + self.day.zfill(2)
    self.baseURL = "https://adnoticias.mx/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2) + "/"
    yield scrapy.Request(url=self.baseURL, callback=self.parseList)
#-----------------------------------------------------------------------
def parseList(self, response):
......
#USER_AGENT = 'heraldoLeon (+http://www.yourdomain.com)'

# Obey robots.txt rules
# Disabled so archive pages disallowed by robots.txt are still fetched.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Throttle politely: half a second between requests to the same site.
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}
# Present a desktop-browser User-Agent; the default Scrapy UA is blocked by some sites.
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
......
......@@ -16,15 +16,14 @@ def remove_tags(text):
class NoticiasSpider(scrapy.Spider):
    """Spider for the heraldoleon.mx daily news archive.

    Crawls the /YYYY/MM/DD/ archive page for the date given via spider
    arguments (``-a year=... -a month=... -a day=...``) and follows each
    article link.
    """
    name = 'noticias'
    allowed_domains = ['heraldoleon.mx']
    start_urls = ['https://heraldoleon.mx/']

    def start_requests(self):
        """Build the date-archive URL from spider arguments and request it."""
        self.year = getattr(self, "year", None)
        self.month = getattr(self, "month", None)
        self.day = getattr(self, "day", None)
        # Zero-pad month/day so the URL matches the site's /YYYY/MM/DD/ scheme.
        # NOTE(review): raises TypeError when an argument is missing — presumably
        # the crawl is always launched with all three; confirm with caller.
        self.baseURL = "https://www.heraldoleon.mx/" + self.year + "/" + self.month.zfill(2) + "/" + self.day.zfill(2) + "/"
        yield scrapy.Request(url=self.baseURL, callback=self.parse)
#-----------------------------------------------------------------------
def parse(self, response):
    """Parse one archive listing page.

    Yields a request per article link (handled by ``parse_item``), then
    follows the pagination "next" arrow, which in this theme is an anchor
    with ``aria-label="next-page"`` inside the page-nav div.
    """
    for link in response.xpath('//h3[@class="entry-title td-module-title"]/a/@href').extract():
        yield scrapy.Request(url=link, callback=self.parse_item)
    next_page = response.xpath('//div[contains(@class,"page-nav")]/a[@aria-label="next-page"]/@href').extract_first()
    print("nextPage", next_page)
    # extract_first() returns None on the last page, which stops the recursion.
    if next_page is not None:
        yield scrapy.Request(url=next_page, callback=self.parse)
......@@ -42,18 +41,18 @@ class NoticiasSpider(scrapy.Spider):
def parse_item(self, response):
    """Extract one article page into a ``HeraldoleonItem``.

    Fields read from the page (tagDiv "tdb" theme markup):
    date (og article:published_time meta), title (h1.tdb-title-text),
    text (paragraphs under the td-post-content div), topic (tdb-tags list),
    url, and author (links inside the tdb_single_via block).
    """
    item = HeraldoleonItem()
    item['date'] = response.xpath("//meta[@property='article:published_time']/@content").extract_first()
    item['title'] = response.xpath('//h1[@class="tdb-title-text"]/text()').extract_first()
    # Strip markup from each paragraph and join them with newlines.
    text = ""
    for p in response.xpath('//div[contains(@class, "td-post-content")]/div/p').extract():
        text += remove_tags(p) + "\n"
    item['text'] = text.strip()
    item['topic'] = response.xpath('//ul[@class="tdb-tags"]/li/a/text()').extract()
    item['url'] = response.url
    # NOTE(review): the trailing space in "tdb_single_via " matches the site's
    # class attribute as crawled — confirm before "fixing" it.
    item["author"] = ", ".join(response.xpath('//div[contains(@class, "tdb_single_via ")]/div/a/text()').extract())
    print(item["title"])
    yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment