m3 / crawlersNoticias — Commits

Commit 57f362cd
Authored Apr 15, 2020 by Mario Chirinos Colunga
Commit message: el sur
Parent commit: eac90030

Showing 5 changed files with 203 additions and 100 deletions (+203 / -100):
items.py (descarga_por_dia/elSur/elSur/items.py): +9 / -9
settings.py (descarga_por_dia/elSur/elSur/settings.py): +1 / -1
noticias.bk (descarga_por_dia/elSur/elSur/spiders/noticias.bk, new file): +90 / -0
noticias.py (descarga_por_dia/elSur/elSur/spiders/noticias.py): +58 / -90
out_test.json (descarga_por_dia/elSur/out_test.json, new file): +45 / -0
descarga_por_dia/elSur/elSur/items.py

@@ -9,12 +9,12 @@ import scrapy

class NoticiasItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()

(Both diff sides of this hunk show identical code, so the hunk is listed once.)
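A Scrapy Item behaves like a dict with a fixed key set, which is what lets the spiders below assign item['title'], item['text'], and so on. A minimal sketch of that behavior (the values here are invented for illustration):

    from elSur.items import NoticiasItem

    item = NoticiasItem()
    item['title'] = 'hypothetical headline'   # any field declared in items.py
    item['url'] = 'https://suracapulco.mx/'
    print(dict(item))                         # items convert cleanly to plain dicts
    # item['summary'] = '...'                 # would raise KeyError: 'summary' is not a declared Field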
descarga_por_dia/elSur/elSur/settings.py

@@ -8,7 +8,7 @@
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

FEED_EXPORT_ENCODING = 'utf-8'

BOT_NAME = 'elSur'

SPIDER_MODULES = ['elSur.spiders']
...
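The FEED_EXPORT_ENCODING setting in this hunk matters for a Spanish-language source: by default, Scrapy's JSON feed exporter escapes non-ASCII characters as \uXXXX sequences, and setting it to 'utf-8' keeps accented text readable. The standard-library json module shows the same contrast (sample string invented):

    import json

    s = "periódico El Sur"
    print(json.dumps({"title": s}))
    # default (ensure_ascii=True): {"title": "peri\u00f3dico El Sur"}
    print(json.dumps({"title": s}, ensure_ascii=False))
    # the behavior FEED_EXPORT_ENCODING = 'utf-8' gives in feeds: {"title": "periódico El Sur"}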
descarga_por_dia/elSur/elSur/spiders/noticias.bk (new file, mode 100644)

"""
MEDIA:
El Sur, Guerrero

USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem

TAG_RE = re.compile(r'<[^>]+>')  # uncommented: remove_tags() below requires it
#HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')

def remove_tags(text):
    return TAG_RE.sub('', text)

class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class
    """
    name = "noticias"

    def start_requests(self):
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        print(response)
        ## for link in response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract():
        for link in set(response.css('#post-list').css("li > div > div > a::attr(href)").extract()):  # added the missing closing ')'
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)

        # pag_lst = response.css('div.dslc-pagination > ul > li')
        # if len(pag_lst) > 0:
        #     del pag_lst[0]
        #     del pag_lst[0]
        #     next_page = None
        #     for li_obj in pag_lst:
        #         li = remove_tags(li_obj.extract())
        #         if not li.isdigit():
        #             next_page = li_obj.xpath('./a/@href').extract_first()
        #             break
        #
        #     if next_page is not None: yield scrapy.Request(url=next_page, callback=self.parse)

    def parse_item(self, response):
        print(response)
        item = NoticiasItem()
        # text = ''
        # news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        # title = response.css('div.dslc-tp-title > h1').extract_first()
        # if title is not None: title = remove_tags(title)
        # topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first()
        # if topic is not None: topic = remove_tags(topic)
        # for p in response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract():
        #     p = remove_tags(p)
        #     text += p + "\n"
        # dateline = response.css('span.dateline').extract_first()
        # if dateline is not None:
        #     dateline = remove_tags(dateline)
        #     text = text.replace(dateline, '')
        # text = text.replace(u'\u00a0', ' ')
        # text = HEAD_RE_1.sub('', text)
        # text = HEAD_RE_2.sub('', text)
        # ## News item info ##
        # item['date'] = news_date
        # item['title'] = title
        # item['topic'] = topic
        # item['text'] = text.strip()
        # item['url'] = response.url
        yield item
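The remove_tags() helper strips markup with a regex rather than an HTML parser, which is adequate for the small inline fragments these spiders feed it. A quick illustration of what TAG_RE matches (sample HTML invented):

    import re

    TAG_RE = re.compile(r'<[^>]+>')   # any run of characters between '<' and '>'

    def remove_tags(text):
        return TAG_RE.sub('', text)

    print(remove_tags('<h1>El Sur <b>de Acapulco</b></h1>'))  # -> El Sur de Acapulco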
descarga_por_dia/elSur/elSur/spiders/noticias.py
View file @
57f362cd
# -*- coding: utf-8 -*-
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import
scrapy
,
re
from
elSur.items
import
NoticiasItem
TAG_RE
=
re
.
compile
(
r'<[^>]+>'
)
def
remove_tags
(
text
):
return
TAG_RE
.
sub
(
''
,
text
)
HEAD_RE_1
=
re
.
compile
(
r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?'
)
HEAD_RE_2
=
re
.
compile
(
r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?'
)
#from dateutil.parser import parse
from
dateparser
import
parse
from
datetime
import
datetime
class
QuotesSpider
(
scrapy
.
Spider
):
"""
Basic Scrapy Spider class
"""
name
=
"noticias"
def
start_requests
(
self
):
year
=
getattr
(
self
,
"year"
,
None
)
month
=
getattr
(
self
,
"month"
,
None
)
day
=
getattr
(
self
,
"day"
,
None
)
baseURL
=
"https://suracapulco.mx/{0}/{1}/{2}/"
.
format
(
year
,
month
.
zfill
(
2
),
day
.
zfill
(
2
))
yield
scrapy
.
Request
(
url
=
baseURL
,
callback
=
self
.
parse
)
def
parse
(
self
,
response
):
for
link
in
response
.
css
(
'div.dslc-blog-posts'
)
.
css
(
'div.dslc-blog-post-title > h2 > a::attr(href)'
)
.
extract
():
yield
scrapy
.
Request
(
url
=
link
,
callback
=
self
.
parse_item
)
pag_lst
=
response
.
css
(
'div.dslc-pagination > ul > li'
)
if
len
(
pag_lst
)
>
0
:
del
pag_lst
[
0
]
del
pag_lst
[
0
]
next_page
=
None
for
li_obj
in
pag_lst
:
li
=
remove_tags
(
li_obj
.
extract
())
if
not
li
.
isdigit
():
next_page
=
li_obj
.
xpath
(
'./a/@href'
)
.
extract_first
()
break
if
next_page
is
not
None
:
yield
scrapy
.
Request
(
url
=
next_page
,
callback
=
self
.
parse
)
def
parse_item
(
self
,
response
):
item
=
NoticiasItem
()
text
=
''
news_date
=
response
.
xpath
(
'//meta[@property="article:published_time"]/@content'
)
.
extract_first
()
title
=
response
.
css
(
'div.dslc-tp-title > h1'
)
.
extract_first
()
if
title
is
not
None
:
title
=
remove_tags
(
title
)
topic
=
response
.
css
(
'div.dslc-tp-meta'
)
.
xpath
(
'./ul/li[3]/a[1]'
)
.
extract_first
()
if
topic
is
not
None
:
topic
=
remove_tags
(
topic
)
for
p
in
response
.
xpath
(
'//div[@id="dslc-theme-content-inner"]'
)
.
css
(
'p'
)
.
extract
():
p
=
remove_tags
(
p
)
text
+=
p
+
"
\n
"
dateline
=
response
.
css
(
'span.dateline'
)
.
extract_first
()
if
dateline
is
not
None
:
dateline
=
remove_tags
(
dateline
)
text
=
text
.
replace
(
dateline
,
''
)
text
=
text
.
replace
(
u'
\u00a0
'
,
' '
)
text
=
HEAD_RE_1
.
sub
(
''
,
text
)
text
=
HEAD_RE_2
.
sub
(
''
,
text
)
## News item info ##
item
[
'date'
]
=
news_date
item
[
'title'
]
=
title
item
[
'topic'
]
=
topic
item
[
'text'
]
=
text
.
strip
()
item
[
'url'
]
=
response
.
url
yield
item
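HEAD_RE_1 and HEAD_RE_2 in the removed parse_item() strip the "Texto: ... / Foto: ..." byline header, optionally together with a following dateline ending in a Spanish long-form date. A quick check of the pattern against an invented header:

    import re

    HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')

    s = ("Texto: Juan Perez / Foto: Maria Lopez\n"
         "Acapulco, 5 de septiembre de 2018.\n"
         "Cuerpo de la nota...")
    print(HEAD_RE_1.sub('', s))  # byline and dateline are removed; the body remains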
"""
Basic Scrapy Spider class
"""
name
=
"noticias"
def
start_requests
(
self
):
year
=
getattr
(
self
,
"year"
,
None
)
month
=
getattr
(
self
,
"month"
,
None
)
day
=
getattr
(
self
,
"day"
,
None
)
self
.
baseURL
=
"https://suracapulco.mx/{0}/{1}/{2}/"
.
format
(
year
,
month
.
zfill
(
2
),
day
.
zfill
(
2
))
print
(
self
.
baseURL
)
yield
scrapy
.
Request
(
url
=
self
.
baseURL
,
callback
=
self
.
parse
)
def
parse
(
self
,
response
):
print
(
response
)
for
link
in
response
.
css
(
'#post-list'
)
.
css
(
"li > div > div > a::attr(href)"
)
.
extract
():
print
(
link
)
yield
scrapy
.
Request
(
url
=
link
,
callback
=
self
.
parse_item
)
if
len
(
response
.
css
(
".paging-navigation > a::attr(href)"
)
.
extract
())
>
0
:
next_page
=
response
.
css
(
".paging-navigation > a::attr(href)"
)
.
extract
()[
0
]
print
(
next_page
)
if
next_page
is
not
None
:
yield
scrapy
.
Request
(
url
=
next_page
,
callback
=
self
.
parse
)
def
parse_item
(
self
,
response
):
print
(
response
.
encoding
)
date
=
response
.
xpath
(
'//meta[@property="article:published_time"]/@content'
)
.
extract_first
()
title
=
response
.
xpath
(
'//meta[@property="og:title"]/@content'
)
.
extract_first
()
.
lower
()
topic
=
response
.
xpath
(
'//meta[@property="article:section"]/@content'
)
.
extract_first
()
.
lower
()
text
=
""
for
p
in
response
.
css
(
"div.xt-post-content > p::text"
)
.
extract
():
text
+=
p
.
replace
(
"
\n
"
,
""
)
+
"
\n
"
item
=
NoticiasItem
()
item
[
'date'
]
=
datetime
.
fromtimestamp
(
int
(
date
))
.
isoformat
()
item
[
'title'
]
=
title
item
[
'topic'
]
=
topic
item
[
'text'
]
=
text
item
[
'url'
]
=
response
.
url
if
item
[
"text"
]
!=
""
:
print
(
item
)
yield
item
else
:
yield
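One thing to note in the added parse_item(): item['date'] = datetime.fromtimestamp(int(date)).isoformat() assumes the article:published_time meta tag carries a Unix timestamp; if the site serves an ISO-8601 string instead, int(date) raises ValueError. The dateparser.parse imported at the top of the file would cover that case, so a more defensive conversion might look like this sketch (to_iso is a hypothetical helper, raw values invented):

    from datetime import datetime
    from dateparser import parse

    def to_iso(raw):
        """Normalize a published-time meta value to ISO-8601."""
        if raw.isdigit():                  # e.g. "1586926800" (Unix timestamp)
            return datetime.fromtimestamp(int(raw)).isoformat()
        return parse(raw).isoformat()      # e.g. "2020-04-15T00:00:00-05:00"

    print(to_iso("1586926800"))

(The bare yield in the else branch emits None, which Scrapy accepts and silently ignores.)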
descarga_por_dia/elSur/out_test.json (new file, mode 100644)

This source diff could not be displayed because it is too large. You can view the blob instead.