Commit afe0647f authored by Renán Sosa Guillen

actualizadas fecha + zona horaria

parent 8b71564d
import scrapy
from datetime import datetime, date
from datetime import datetime, date, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22
......@@ -9,6 +9,19 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    # TAG_RE is the module-level compiled pattern that matches one tag.
    cleaned = TAG_RE.sub('', text)
    return cleaned
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Yucatan (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -21,7 +34,9 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
......@@ -60,12 +75,19 @@ class QuotesSpider(scrapy.Spider):
for post in response.xpath('//a[@class="lista_noticia"]'):
date = post.css('div.pad5').css('p.sec_autor2::text').extract_first()
news_date = datetime.strptime(date[21:31], '%Y-%m-%d').date()
# if ( self.year+'-'+self.month+'-'+self.day == date[21:31] ):
if ( news_date == self.date ):
item['date'] = news_date
date = date[21:]
news_date, news_time = date.split(' ')
news_date = map(int, news_date.split('-'))
news_time = map(int, news_time.split(':'))
item['date'] = datetime(news_date[0],news_date[1],news_date[2],news_time[0],news_time[1],news_time[2],tzinfo=self.tz).isoformat('T')
# item['date'] = news_date
item['topic'] = response.xpath('//div[@id="color_seccion"]/h1/text()').extract_first()
request = scrapy.Request(url=self.baseURL+post.css('::attr(href)').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
......
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return re.sub(TAG_RE, '', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Yucatan (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,49 +29,63 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
item = NoticiasItem()
for post in response.xpath('//ul[@class="archivepost"]/li'):
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
# item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = datetime(int(self.year),int(self.month),int(self.day),tzinfo=self.tz).isoformat('T')
item['topic'] = post.xpath('./p/a/text()').extract()
request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
def parse_item(self, response):
text = ''
item = response.meta['item']
item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy
from datetime import date
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import scrapy, re
from datetime import datetime, timedelta, tzinfo
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>')
......@@ -10,6 +9,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-7) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Sonora (Pacific time): UTC-7
        return timedelta(hours=-7)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-7'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -22,29 +33,32 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = date(int(year), int(month), int(day))
self.date = datetime(int(year),int(month),int(day),tzinfo=tz).isoformat('T')
self.baseURL='http://diariodelyaqui.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
if ( len(pagination) > 0 ):
pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages):
if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
......@@ -60,10 +74,13 @@ class QuotesSpider(scrapy.Spider):
item['date'] = self.date
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
from datetime import date
import scrapy
from datetime import date, datetime, timedelta, tzinfo, time
import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return re.sub(TAG_RE, '', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for central Mexico: UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -23,7 +34,9 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -61,7 +74,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list:
item = NoticiasItem()
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html':
......@@ -93,7 +107,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list:
item = NoticiasItem()
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
......@@ -117,7 +132,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list:
item = NoticiasItem()
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.php' or s == 'correo.php':
......@@ -385,7 +401,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item_3(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
if ( len(title) > 0 ):
......@@ -410,7 +427,8 @@ class QuotesSpider(scrapy.Spider):
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
......
import scrapy, re
from datetime import datetime
from datetime import datetime, timedelta, tzinfo
"""
Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
......@@ -16,6 +16,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Guerrero (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -29,6 +41,7 @@ class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -76,7 +89,9 @@ class QuotesSpider(scrapy.Spider):
d = d.replace(',','')
m = d[:d.find(' ')]
d = d.replace(m, self.parse_month[m])
item['date'] = datetime.strptime(d, '%m %d %Y').date()
# item['date'] = datetime.strptime(d, '%m %d %Y').date()
d = map(int, d.split(' '))
item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T')
title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
if title is not None:
......@@ -92,6 +107,7 @@ class QuotesSpider(scrapy.Spider):
item['text'] = text
item['url'] = response.url
# print item['url']
yield item
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return re.sub(TAG_RE, '', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Veracruz (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -21,17 +33,17 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+self.day.zfill(2)+'/'+self.month.zfill(2)+'/'+self.year
urls = [
self.builtURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
yield scrapy.Request(url=self.builtURL, callback=self.parse)
def parse(self, response):
......@@ -59,10 +71,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['title'] = response.xpath('//h2[@class="article-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="content-article-title"]/h2/text()').extract()
item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
item['date'] = self.date
title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
title = title.replace('\r','')
title = title.replace('\n','')
title = title.lstrip(' ')
title = title.rstrip(' ')
item['title'] = title
topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
topic = topic.replace('\r','')
topic = topic.replace('\n','')
topic = topic.lstrip(' ')
topic = topic.rstrip(' ')
item['topic'] = topic
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
if ( len(paragraph) > 0 ):
......
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return re.sub(TAG_RE, '', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Yucatan (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,18 +31,20 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.baseURL='http://lectormx.com/'+self.year+'/'+self.month+'/'+self.day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
......@@ -43,14 +59,16 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//h2[@class="title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
text = ''
item = NoticiasItem()
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = self.date
item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first()
......
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches a single markup tag: '<', one or more non-'>' characters, '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return re.sub(TAG_RE, '', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Yucatan (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -17,18 +31,20 @@ class NoticiasItem(scrapy.Item):
topic = scrapy.Field()
url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
self.baseURL='http://gruporivas.com.mx/notirivas/'+self.year+'/'+self.month+'/'+self.day
urls = [
self.baseURL,
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
......@@ -43,15 +59,17 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
for link in response.xpath('//article/header/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
item['date'] = self.year+'-'+self.month+'-'+self.day
item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()
content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
......@@ -64,6 +82,7 @@ class QuotesSpider(scrapy.Spider):
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
import scrapy, re, datetime
import scrapy, re
from datetime import datetime, timedelta, tzinfo
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
......@@ -10,6 +11,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone (UTC-6) attached to scraped dates.

    The offset is constant; no daylight-saving adjustment is applied.
    """

    def utcoffset(self, dt):
        """Return the fixed offset from UTC as a ``timedelta``."""
        # time zone for Hidalgo (central Mexico): UTC-6
        return timedelta(hours=-6)

    def tzname(self, dt):
        """Return the display name of the time zone."""
        return 'UTC-6'

    def dst(self, dt):
        """Return a zero daylight-saving adjustment.

        The base ``tzinfo.dst`` raises ``NotImplementedError``, which makes
        ``datetime.dst()`` and ``datetime.timetuple()`` fail on aware values
        that carry this zone; a fixed-offset zone reports no adjustment.
        """
        return timedelta(0)
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -24,6 +37,7 @@ class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -77,7 +91,7 @@ class QuotesSpider(scrapy.Spider):
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',','').split(' ')
item['date'] = datetime.date(int(d[2]), self.date_parser[d[0].lower()], int(d[1]))
item['date'] = datetime(int(d[2]), self.date_parser[d[0].lower()], int(d[1]), tzinfo=self.tz).isoformat('T')
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment