actualizadas fecha + zona horaria

afe0647f · Renán Sosa Guillen · 8b71564d · afe0647f · afe0647f · afe0647f
Commit afe0647f authored Oct 20, 2017 by Renán Sosa Guillen
18 changed files
--- a/descarga_hacia_atras/yucatanAlMinuto/yucatanAlMinuto/spiders/noticias.py
+++ b/descarga_hacia_atras/yucatanAlMinuto/yucatanAlMinuto/spiders/noticias.py
 import scrapy
-from datetime import datetime, date
+from datetime import datetime, date, timedelta, tzinfo
 #scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22
@@ -9,6 +9,19 @@ TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
 	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Yucatan (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
 	title = scrapy.Field()
 	text = scrapy.Field()
@@ -21,7 +34,9 @@ class NoticiasItem(scrapy.Item):
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
+		self.tz = UTC()
 		self.year = getattr(self, 'year', None)
 		self.month = getattr(self, 'month', None)
 		self.day = getattr(self, 'day', None)
@@ -60,12 +75,19 @@ class QuotesSpider(scrapy.Spider):
 		for post in response.xpath('//a[@class="lista_noticia"]'):
 			date = post.css('div.pad5').css('p.sec_autor2::text').extract_first()
 			news_date = datetime.strptime(date[21:31], '%Y-%m-%d').date()
-			# if ( self.year+'-'+self.month+'-'+self.day == date[21:31] ):
 			if ( news_date == self.date ):
-				item['date'] = news_date
+				date = date[21:]
+				news_date, news_time = date.split(' ')
+				news_date = map(int, news_date.split('-'))
+				news_time = map(int, news_time.split(':'))
+				item['date'] = datetime(news_date[0],news_date[1],news_date[2],news_time[0],news_time[1],news_time[2],tzinfo=self.tz).isoformat('T')
+				# item['date'] = news_date
 				item['topic'] = response.xpath('//div[@id="color_seccion"]/h1/text()').extract_first()
 				request = scrapy.Request(url=self.baseURL+post.css('::attr(href)').extract_first(), callback=self.parse_item)
 				request.meta['item'] = item
 				yield request

--- a/descarga_hacia_atras/yucatanAlMinuto/yucatanAlMinuto/spiders/noticias.pyc
+++ b/descarga_hacia_atras/yucatanAlMinuto/yucatanAlMinuto/spiders/noticias.pyc
--- a/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.py
+++ b/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.py
-import scrapy
+import scrapy, re
+from datetime import datetime, timedelta, tzinfo
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-import re
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
-    return TAG_RE.sub('', text)
+		return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Yucatan (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
+		title = scrapy.Field()
-    text = scrapy.Field()
+		text = scrapy.Field()
-    date = scrapy.Field()
+		date = scrapy.Field()
-    location = scrapy.Field()
+		location = scrapy.Field()
-    author = scrapy.Field()
+		author = scrapy.Field()
-    topic = scrapy.Field()
+		topic = scrapy.Field()
-    url = scrapy.Field()
+		url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
+		self.tz = UTC()
 		self.year = getattr(self, 'year', None)
 		self.month = getattr(self, 'month', None)
 		self.day = getattr(self, 'day', None)
 		self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day
-		urls = [
-			self.baseURL,
+		yield scrapy.Request(url=self.baseURL, callback=self.parse)
-                ]
-		for url in urls:
-			yield scrapy.Request(url=url, callback=self.parse)
 	def parse(self, response):
 		pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
 		if ( len(pagination) > 0 ):
 			pagination = pagination[-1].strip('/')
 			pages = int(pagination[pagination.rfind('/')+1:])
 			for page in range(0, pages):
 				if ( page == 0 ):
 					yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
 				else:
 					yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
 		else:
 			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
 	def parse_page(self, response):
 		item = NoticiasItem()
 		for post in response.xpath('//ul[@class="archivepost"]/li'):
-			item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+			# item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+			item['date'] = datetime(int(self.year),int(self.month),int(self.day),tzinfo=self.tz).isoformat('T')
 			item['topic'] = post.xpath('./p/a/text()').extract()
 			request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
 			request.meta['item'] = item
 			yield request
 	def parse_item(self, response):
 		text = ''
 		item = response.meta['item']
 		item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
 		for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
 			text += remove_tags(paragraph) + '\n'
 		item['text'] = text
 		item['url'] = response.url
 		# print item['title']
 		yield item
--- a/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.pyc
+++ b/descarga_por_dia/desdeElBalcon/desdeElBalcon/spiders/noticias.pyc
--- a/descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.py
+++ b/descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.py
-import scrapy
+import scrapy, re
-from datetime import date
+from datetime import datetime, timedelta, tzinfo
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-import re
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
-    return TAG_RE.sub('', text)
+	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-7; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Sonora (Mexico's Pacific time zone): UTC-7; DST is not applied
+		return timedelta(hours=-7)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-7'
 class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
+	title = scrapy.Field()
-    text = scrapy.Field()
+	text = scrapy.Field()
-    date = scrapy.Field()
+	date = scrapy.Field()
-    location = scrapy.Field()
+	location = scrapy.Field()
-    author = scrapy.Field()
+	author = scrapy.Field()
-    topic = scrapy.Field()
+	topic = scrapy.Field()
-    url = scrapy.Field()
+	url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
+		tz = UTC()
 		year = getattr(self, 'year', None)
 		month = getattr(self, 'month', None)
 		day = getattr(self, 'day', None)
-		self.date = date(int(year), int(month), int(day))
+		self.date = datetime(int(year),int(month),int(day),tzinfo=tz).isoformat('T')
 		self.baseURL='http://diariodelyaqui.mx/'+year+'/'+month+'/'+day
-		urls = [
-			self.baseURL,
+		yield scrapy.Request(url=self.baseURL, callback=self.parse)
-                ]
-		for url in urls:
-			yield scrapy.Request(url=self.baseURL, callback=self.parse)
 	def parse(self, response):
 		pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
 		if ( len(pagination) > 0 ):
 			pagination = pagination[-2].strip('/')
 			pages = int(pagination[pagination.rfind('/')+1:])
 			for page in range(0,pages):
 				if ( page == 0 ):
 					yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
 				else:
 					yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
 		else:
 			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -60,10 +74,13 @@ class QuotesSpider(scrapy.Spider):
 		item['date'] = self.date
 		item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
 		item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
 		for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
 			text += remove_tags(paragraph) + '\n'
 		item['text'] = text
 		item['url'] = response.url
 		# print item['title']
 		yield item
--- a/descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.pyc
+++ b/descarga_por_dia/diarioYaqui/diarioYaqui/spiders/noticias.pyc
--- a/descarga_por_dia/laJornada/laJornada/spiders/noticias.py
+++ b/descarga_por_dia/laJornada/laJornada/spiders/noticias.py
-from datetime import date
+from datetime import date, datetime, timedelta, tzinfo, time
-import scrapy
+import scrapy, re
-# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-import re
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
 	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for central Mexico: UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
 	title = scrapy.Field()
 	text = scrapy.Field()
@@ -23,7 +34,9 @@ class NoticiasItem(scrapy.Item):
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
+		self.tz = UTC()
 		year = getattr(self, 'year', None)
 		month = getattr(self, 'month', None)
 		day = getattr(self, 'day', None)
@@ -61,7 +74,8 @@ class QuotesSpider(scrapy.Spider):
 			for s in section_list:
 				item = NoticiasItem()
-				item['date'] = self.date
+				# item['date'] = self.date
+				item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
 				item['topic'] = parse_s[s]
 				if s == 'edito.html' or s == 'correo.html':
@@ -93,7 +107,8 @@ class QuotesSpider(scrapy.Spider):
 			for s in section_list:
 				item = NoticiasItem()
-				item['date'] = self.date
+				# item['date'] = self.date
+				item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
 				item['topic'] = parse_s[s]
 				if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
@@ -117,7 +132,8 @@ class QuotesSpider(scrapy.Spider):
 			for s in section_list:
 				item = NoticiasItem()
-				item['date'] = self.date
+				# item['date'] = self.date
+				item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
 				item['topic'] = parse_s[s]
 				if s == 'edito.php' or s == 'correo.php':
@@ -385,7 +401,8 @@ class QuotesSpider(scrapy.Spider):
 	def parse_item_3(self, response):
 		item = NoticiasItem()
 		text = ''
-		item['date'] = self.date
+		# item['date'] = self.date
+		item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
 		title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
 		if ( len(title) > 0 ):
@@ -410,7 +427,8 @@ class QuotesSpider(scrapy.Spider):
 		# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
 		path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
-		item['date'] = self.date
+		# item['date'] = self.date
+		item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
 		item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
 		item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()

--- a/descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc
+++ b/descarga_por_dia/laJornada/laJornada/spiders/noticias.pyc
--- a/descarga_por_dia/laJornadaGro/laJornadaGro/spiders/noticias.py
+++ b/descarga_por_dia/laJornadaGro/laJornadaGro/spiders/noticias.py
 import scrapy, re
-from datetime import datetime
+from datetime import datetime, timedelta, tzinfo
 """
 Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
@@ -16,6 +16,18 @@ def remove_tags(text):
 	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Guerrero (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
 	title = scrapy.Field()
 	text = scrapy.Field()
@@ -29,6 +41,7 @@ class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
+		self.tz = UTC()
 		year = getattr(self, 'year', None)
 		month = getattr(self, 'month', None)
 		day = getattr(self, 'day', None)
@@ -76,7 +89,9 @@ class QuotesSpider(scrapy.Spider):
 			d = d.replace(',','')
 			m = d[:d.find(' ')]
 			d = d.replace(m, self.parse_month[m])
-			item['date'] = datetime.strptime(d, '%m %d %Y').date()
+			# item['date'] = datetime.strptime(d, '%m %d %Y').date()
+			d = map(int, d.split(' '))
+			item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T')
 		title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
 		if title is not None:
@@ -92,6 +107,7 @@ class QuotesSpider(scrapy.Spider):
 		item['text'] = text
 		item['url'] = response.url
 		# print item['url']
 		yield item
--- a/descarga_por_dia/laJornadaGro/laJornadaGro/spiders/noticias.pyc
+++ b/descarga_por_dia/laJornadaGro/laJornadaGro/spiders/noticias.pyc
--- a/descarga_por_dia/laJornadaVer/laJornadaVer/spiders/noticias.py
+++ b/descarga_por_dia/laJornadaVer/laJornadaVer/spiders/noticias.py
-import scrapy
+import scrapy, re
+from datetime import datetime, timedelta, tzinfo
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-import re
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
-    return TAG_RE.sub('', text)
+	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Veracruz (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
+	title = scrapy.Field()
-    text = scrapy.Field()
+	text = scrapy.Field()
-    date = scrapy.Field()
+	date = scrapy.Field()
-    location = scrapy.Field()
+	location = scrapy.Field()
-    author = scrapy.Field()
+	author = scrapy.Field()
-    topic = scrapy.Field()
+	topic = scrapy.Field()
-    url = scrapy.Field()
+	url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
-		self.year = getattr(self, 'year', None)
+		tz = UTC()
-		self.month = getattr(self, 'month', None)
+		year = getattr(self, 'year', None)
-		self.day = getattr(self, 'day', None)
+		month = getattr(self, 'month', None)
+		day = getattr(self, 'day', None)
+		self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
 		self.baseURL = 'http://www.jornadaveracruz.com.mx/'
-		self.builtURL= self.baseURL+'Archive.aspx?date='+self.day.zfill(2)+'/'+self.month.zfill(2)+'/'+self.year
+		self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
-		urls = [
-			self.builtURL,
+		yield scrapy.Request(url=self.builtURL, callback=self.parse)
-                ]
-		for url in urls:
-			yield scrapy.Request(url=url, callback=self.parse)
 	def parse(self, response):
@@ -59,10 +71,22 @@ class QuotesSpider(scrapy.Spider):
 	def parse_item(self, response):
 		item = NoticiasItem()
 		text = ''
-		item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+		item['date'] = self.date
-		item['title'] = response.xpath('//h2[@class="article-title"]/text()').extract_first()
-		item['topic'] = response.xpath('//*[@class="content-article-title"]/h2/text()').extract()
+		title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
-		item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
+		title = title.replace('\r','')
+		title = title.replace('\n','')
+		title = title.lstrip(' ')
+		title = title.rstrip(' ')
+		item['title'] = title
+		topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
+		topic = topic.replace('\r','')
+		topic = topic.replace('\n','')
+		topic = topic.lstrip(' ')
+		topic = topic.rstrip(' ')
+		item['topic'] = topic
+		# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
 		paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
 		if ( len(paragraph) > 0 ):

--- a/descarga_por_dia/laJornadaVer/laJornadaVer/spiders/noticias.pyc
+++ b/descarga_por_dia/laJornadaVer/laJornadaVer/spiders/noticias.pyc
--- a/descarga_por_dia/lectorMX/lectorMX/spiders/noticias.py
+++ b/descarga_por_dia/lectorMX/lectorMX/spiders/noticias.py
-import scrapy
+import scrapy, re
+from datetime import datetime, timedelta, tzinfo
-#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
-import re
+## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
-   return TAG_RE.sub('', text)
+	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Yucatan (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
+	title = scrapy.Field()
-    text = scrapy.Field()
+	text = scrapy.Field()
-    date = scrapy.Field()
+	date = scrapy.Field()
-    location = scrapy.Field()
+	location = scrapy.Field()
-    author = scrapy.Field()
+	author = scrapy.Field()
-    topic = scrapy.Field()
+	topic = scrapy.Field()
-    url = scrapy.Field()
+	url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
-		self.year = getattr(self, 'year', None)
+		tz = UTC()
-		self.month = getattr(self, 'month', None)
+		year = getattr(self, 'year', None)
-		self.day = getattr(self, 'day', None)
+		month = getattr(self, 'month', None)
-		self.baseURL='http://lectormx.com/'+self.year+'/'+self.month+'/'+self.day
+		day = getattr(self, 'day', None)
-		urls = [
+		self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
-			self.baseURL,
+		self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day
-               ]
-		for url in urls:
+		yield scrapy.Request(url=self.baseURL, callback=self.parse)
-			yield scrapy.Request(url=url, callback=self.parse)
 	def parse(self, response):
 		pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
@@ -43,14 +59,16 @@ class QuotesSpider(scrapy.Spider):
 		else:
 			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
 	def parse_page(self, response):
 		for link in response.xpath('//h2[@class="title"]/a/@href').extract():
-		 	yield scrapy.Request(url=link, callback=self.parse_item)
+			yield scrapy.Request(url=link, callback=self.parse_item)
 	def parse_item(self, response):
 		text = ''
 		item = NoticiasItem()
-		item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
+		item['date'] = self.date
 		item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
 		item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
 		item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first()

--- a/descarga_por_dia/lectorMX/lectorMX/spiders/noticias.pyc
+++ b/descarga_por_dia/lectorMX/lectorMX/spiders/noticias.pyc
--- a/descarga_por_dia/notirivas/notirivas/spiders/noticias.py
+++ b/descarga_por_dia/notirivas/notirivas/spiders/noticias.py
-import scrapy
+import scrapy, re
+from datetime import datetime, timedelta, tzinfo
 #scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
-import re
 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
-   return TAG_RE.sub('', text)
+	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Yucatan (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
+	title = scrapy.Field()
-    text = scrapy.Field()
+	text = scrapy.Field()
-    date = scrapy.Field()
+	date = scrapy.Field()
-    location = scrapy.Field()
+	location = scrapy.Field()
-    author = scrapy.Field()
+	author = scrapy.Field()
-    topic = scrapy.Field()
+	topic = scrapy.Field()
-    url = scrapy.Field()
+	url = scrapy.Field()
 class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
-		self.year = getattr(self, 'year', None)
+		tz = UTC()
-		self.month = getattr(self, 'month', None)
+		year = getattr(self, 'year', None)
-		self.day = getattr(self, 'day', None)
+		month = getattr(self, 'month', None)
-		self.baseURL='http://gruporivas.com.mx/notirivas/'+self.year+'/'+self.month+'/'+self.day
+		day = getattr(self, 'day', None)
-		urls = [
+		self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
-			self.baseURL,
+		self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
-               ]
-		for url in urls:
+		yield scrapy.Request(url=self.baseURL, callback=self.parse)
-			yield scrapy.Request(url=url, callback=self.parse)
 	def parse(self, response):
 		pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
@@ -43,15 +59,17 @@ class QuotesSpider(scrapy.Spider):
 		else:
 			yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
 	def parse_page(self, response):
 		for link in response.xpath('//article/header/h2/a/@href').extract():
-		 	yield scrapy.Request(url=link, callback=self.parse_item)
+			yield scrapy.Request(url=link, callback=self.parse_item)
 	def parse_item(self, response):
 		item = NoticiasItem()
 		text = ''
+		item['date'] = self.date
 		item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
-		item['date'] = self.year+'-'+self.month+'-'+self.day
 		item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()
 		content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
@@ -64,6 +82,7 @@ class QuotesSpider(scrapy.Spider):
 			text += remove_tags(paragraph) + '\n'
 		item['text'] = text
 		item['url'] = response.url
 		# print item['title']
 		yield item
--- a/descarga_por_dia/notirivas/notirivas/spiders/noticias.pyc
+++ b/descarga_por_dia/notirivas/notirivas/spiders/noticias.pyc
--- a/descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.py
+++ b/descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.py
-import scrapy, re, datetime
+import scrapy, re
+from datetime import datetime, timedelta, tzinfo
 '''
 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
@@ -10,6 +11,18 @@ def remove_tags(text):
 	return TAG_RE.sub('', text)
+class UTC(tzinfo):
+	"""Fixed-offset tzinfo for UTC-6; despite its name, this class does not represent UTC."""
+	def utcoffset(self, dt):
+		# fixed offset for Hidalgo (central Mexico time): UTC-6; DST is not applied
+		return timedelta(hours=-6)
+	def tzname(self, dt):
+		# name of the time zone
+		return 'UTC-6'
 class NoticiasItem(scrapy.Item):
 	title = scrapy.Field()
 	text = scrapy.Field()
@@ -24,6 +37,7 @@ class QuotesSpider(scrapy.Spider):
 	name = "noticias"
 	def start_requests(self):
+		self.tz = UTC()
 		year = getattr(self, 'year', None)
 		month = getattr(self, 'month', None)
 		day = getattr(self, 'day', None)
@@ -77,7 +91,7 @@ class QuotesSpider(scrapy.Spider):
 		d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
 		d = d.replace(',','').split(' ')
-		item['date'] = datetime.date(int(d[2]), self.date_parser[d[0].lower()], int(d[1]))
+		item['date'] = datetime(int(d[2]), self.date_parser[d[0].lower()], int(d[1]), tzinfo=self.tz).isoformat('T')
 		item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
 		item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()

--- a/descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.pyc
+++ b/descarga_por_dia/unoMasUno/unoMasUno/spiders/noticias.pyc