Commit afe0647f authored by Renán Sosa Guillen

actualizadas fecha + zona horaria

parent 8b71564d
import scrapy import scrapy
from datetime import datetime, date from datetime import datetime, date, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22 #scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22
...@@ -9,6 +9,19 @@ TAG_RE = re.compile(r'<[^>]+>') ...@@ -9,6 +9,19 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Yucatan (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -21,7 +34,9 @@ class NoticiasItem(scrapy.Item): ...@@ -21,7 +34,9 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None) self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None) self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None) self.day = getattr(self, 'day', None)
...@@ -60,12 +75,19 @@ class QuotesSpider(scrapy.Spider): ...@@ -60,12 +75,19 @@ class QuotesSpider(scrapy.Spider):
for post in response.xpath('//a[@class="lista_noticia"]'): for post in response.xpath('//a[@class="lista_noticia"]'):
date = post.css('div.pad5').css('p.sec_autor2::text').extract_first() date = post.css('div.pad5').css('p.sec_autor2::text').extract_first()
news_date = datetime.strptime(date[21:31], '%Y-%m-%d').date() news_date = datetime.strptime(date[21:31], '%Y-%m-%d').date()
# if ( self.year+'-'+self.month+'-'+self.day == date[21:31] ):
if ( news_date == self.date ): if ( news_date == self.date ):
item['date'] = news_date date = date[21:]
news_date, news_time = date.split(' ')
news_date = map(int, news_date.split('-'))
news_time = map(int, news_time.split(':'))
item['date'] = datetime(news_date[0],news_date[1],news_date[2],news_time[0],news_time[1],news_time[2],tzinfo=self.tz).isoformat('T')
# item['date'] = news_date
item['topic'] = response.xpath('//div[@id="color_seccion"]/h1/text()').extract_first() item['topic'] = response.xpath('//div[@id="color_seccion"]/h1/text()').extract_first()
request = scrapy.Request(url=self.baseURL+post.css('::attr(href)').extract_first(), callback=self.parse_item) request = scrapy.Request(url=self.baseURL+post.css('::attr(href)').extract_first(), callback=self.parse_item)
request.meta['item'] = item request.meta['item'] = item
yield request yield request
......
import scrapy import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Yucatan (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None) self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None) self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None) self.day = getattr(self, 'day', None)
self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day self.baseURL='http://www.desdeelbalcon.com/'+self.year+'/'+self.month+'/'+self.day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract() pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
pagination = pagination[-1].strip('/') pagination = pagination[-1].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0, pages): for page in range(0, pages):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response): def parse_page(self, response):
item = NoticiasItem() item = NoticiasItem()
for post in response.xpath('//ul[@class="archivepost"]/li'): for post in response.xpath('//ul[@class="archivepost"]/li'):
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) # item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = datetime(int(self.year),int(self.month),int(self.day),tzinfo=self.tz).isoformat('T')
item['topic'] = post.xpath('./p/a/text()').extract() item['topic'] = post.xpath('./p/a/text()').extract()
request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item) request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
request.meta['item'] = item request.meta['item'] = item
yield request yield request
def parse_item(self, response): def parse_item(self, response):
text = '' text = ''
item = response.meta['item'] item = response.meta['item']
item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first() item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
for paragraph in response.xpath('//div[@itemprop="text"]/p').extract(): for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
import scrapy import scrapy, re
from datetime import date from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of seven hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Sonora (Pacific time): UTC-7.
        return timedelta(hours=-7)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-7'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
self.date = date(int(year), int(month), int(day)) self.date = datetime(int(year),int(month),int(day),tzinfo=tz).isoformat('T')
self.baseURL='http://diariodelyaqui.mx/'+year+'/'+month+'/'+day self.baseURL='http://diariodelyaqui.mx/'+year+'/'+month+'/'+day
urls = [
self.baseURL, yield scrapy.Request(url=self.baseURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract() pagination = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
if ( len(pagination) > 0 ): if ( len(pagination) > 0 ):
pagination = pagination[-2].strip('/') pagination = pagination[-2].strip('/')
pages = int(pagination[pagination.rfind('/')+1:]) pages = int(pagination[pagination.rfind('/')+1:])
for page in range(0,pages): for page in range(0,pages):
if ( page == 0 ): if ( page == 0 ):
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
else: else:
yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page) yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
...@@ -60,10 +74,13 @@ class QuotesSpider(scrapy.Spider): ...@@ -60,10 +74,13 @@ class QuotesSpider(scrapy.Spider):
item['date'] = self.date item['date'] = self.date
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first() item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract() item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
for paragraph in response.xpath('//div[@class="clearfix"]/p').extract(): for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
from datetime import date from datetime import date, datetime, timedelta, tzinfo, time
import scrapy import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for central Mexico: UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -23,7 +34,9 @@ class NoticiasItem(scrapy.Item): ...@@ -23,7 +34,9 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
...@@ -61,7 +74,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -61,7 +74,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list: for s in section_list:
item = NoticiasItem() item = NoticiasItem()
item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s] item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html': if s == 'edito.html' or s == 'correo.html':
...@@ -93,7 +107,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -93,7 +107,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list: for s in section_list:
item = NoticiasItem() item = NoticiasItem()
item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s] item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php': if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
...@@ -117,7 +132,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -117,7 +132,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list: for s in section_list:
item = NoticiasItem() item = NoticiasItem()
item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s] item['topic'] = parse_s[s]
if s == 'edito.php' or s == 'correo.php': if s == 'edito.php' or s == 'correo.php':
...@@ -385,7 +401,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -385,7 +401,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item_3(self, response): def parse_item_3(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract() title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
if ( len(title) > 0 ): if ( len(title) > 0 ):
...@@ -410,7 +427,8 @@ class QuotesSpider(scrapy.Spider): ...@@ -410,7 +427,8 @@ class QuotesSpider(scrapy.Spider):
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p'] # path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]'] path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
item['date'] = self.date # item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first()) item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first() item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
......
import scrapy, re import scrapy, re
from datetime import datetime from datetime import datetime, timedelta, tzinfo
""" """
Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
...@@ -16,6 +16,18 @@ def remove_tags(text): ...@@ -16,6 +16,18 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Guerrero (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -29,6 +41,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -29,6 +41,7 @@ class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
...@@ -76,7 +89,9 @@ class QuotesSpider(scrapy.Spider): ...@@ -76,7 +89,9 @@ class QuotesSpider(scrapy.Spider):
d = d.replace(',','') d = d.replace(',','')
m = d[:d.find(' ')] m = d[:d.find(' ')]
d = d.replace(m, self.parse_month[m]) d = d.replace(m, self.parse_month[m])
item['date'] = datetime.strptime(d, '%m %d %Y').date() # item['date'] = datetime.strptime(d, '%m %d %Y').date()
d = map(int, d.split(' '))
item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T')
title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first() title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
if title is not None: if title is not None:
...@@ -92,6 +107,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -92,6 +107,7 @@ class QuotesSpider(scrapy.Spider):
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['url'] # print item['url']
yield item yield item
import scrapy import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Veracruz (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.year = getattr(self, 'year', None) tz = UTC()
self.month = getattr(self, 'month', None) year = getattr(self, 'year', None)
self.day = getattr(self, 'day', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL = 'http://www.jornadaveracruz.com.mx/' self.baseURL = 'http://www.jornadaveracruz.com.mx/'
self.builtURL= self.baseURL+'Archive.aspx?date='+self.day.zfill(2)+'/'+self.month.zfill(2)+'/'+self.year self.builtURL= self.baseURL+'Archive.aspx?date='+day.zfill(2)+'/'+month.zfill(2)+'/'+year
urls = [
self.builtURL, yield scrapy.Request(url=self.builtURL, callback=self.parse)
]
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
...@@ -59,10 +71,22 @@ class QuotesSpider(scrapy.Spider): ...@@ -59,10 +71,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) item['date'] = self.date
item['title'] = response.xpath('//h2[@class="article-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="content-article-title"]/h2/text()').extract() title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first() title = title.replace('\r','')
title = title.replace('\n','')
title = title.lstrip(' ')
title = title.rstrip(' ')
item['title'] = title
topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
topic = topic.replace('\r','')
topic = topic.replace('\n','')
topic = topic.lstrip(' ')
topic = topic.rstrip(' ')
item['topic'] = topic
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract() paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
if ( len(paragraph) > 0 ): if ( len(paragraph) > 0 ):
......
import scrapy import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
import re ## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Yucatan (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.year = getattr(self, 'year', None) tz = UTC()
self.month = getattr(self, 'month', None) year = getattr(self, 'year', None)
self.day = getattr(self, 'day', None) month = getattr(self, 'month', None)
self.baseURL='http://lectormx.com/'+self.year+'/'+self.month+'/'+self.day day = getattr(self, 'day', None)
urls = [ self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL, self.baseURL='http://lectormx.com/'+year+'/'+month+'/'+day
]
for url in urls: yield scrapy.Request(url=self.baseURL, callback=self.parse)
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract() pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
...@@ -43,14 +59,16 @@ class QuotesSpider(scrapy.Spider): ...@@ -43,14 +59,16 @@ class QuotesSpider(scrapy.Spider):
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response): def parse_page(self, response):
for link in response.xpath('//h2[@class="title"]/a/@href').extract(): for link in response.xpath('//h2[@class="title"]/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
text = '' text = ''
item = NoticiasItem() item = NoticiasItem()
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2) item['date'] = self.date
item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first() item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first() item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first() item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first()
......
import scrapy import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22 #scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
TAG_RE = re.compile(r'<[^>]+>') TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Yucatan (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.year = getattr(self, 'year', None) tz = UTC()
self.month = getattr(self, 'month', None) year = getattr(self, 'year', None)
self.day = getattr(self, 'day', None) month = getattr(self, 'month', None)
self.baseURL='http://gruporivas.com.mx/notirivas/'+self.year+'/'+self.month+'/'+self.day day = getattr(self, 'day', None)
urls = [ self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
self.baseURL, self.baseURL='http://gruporivas.com.mx/notirivas/'+year+'/'+month+'/'+day
]
for url in urls: yield scrapy.Request(url=self.baseURL, callback=self.parse)
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response): def parse(self, response):
pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract() pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
...@@ -43,15 +59,17 @@ class QuotesSpider(scrapy.Spider): ...@@ -43,15 +59,17 @@ class QuotesSpider(scrapy.Spider):
else: else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True) yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response): def parse_page(self, response):
for link in response.xpath('//article/header/h2/a/@href').extract(): for link in response.xpath('//article/header/h2/a/@href').extract():
yield scrapy.Request(url=link, callback=self.parse_item) yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response): def parse_item(self, response):
item = NoticiasItem() item = NoticiasItem()
text = '' text = ''
item['date'] = self.date
item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first() item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
item['date'] = self.year+'-'+self.month+'-'+self.day
item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first() item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()
content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract() content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
...@@ -64,6 +82,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -64,6 +82,7 @@ class QuotesSpider(scrapy.Spider):
text += remove_tags(paragraph) + '\n' text += remove_tags(paragraph) + '\n'
item['text'] = text item['text'] = text
item['url'] = response.url item['url'] = response.url
# print item['title'] # print item['title']
yield item yield item
import scrapy, re, datetime import scrapy, re
from datetime import datetime, timedelta, tzinfo
''' '''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22 scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
...@@ -10,6 +11,18 @@ def remove_tags(text): ...@@ -10,6 +11,18 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone used to stamp scraped dates.

    Despite its name, this class does not model UTC itself: it reports a
    constant offset of six hours behind UTC and never applies a
    daylight-saving adjustment.
    """

    def utcoffset(self, dt):
        # Time zone for Hidalgo (central Mexico): UTC-6.
        return timedelta(hours=-6)

    def tzname(self, dt):
        # Name of the time zone.
        return 'UTC-6'

    def dst(self, dt):
        # A fixed offset has no DST component. Without this override the
        # tzinfo base class raises NotImplementedError from dt.dst() and
        # dt.timetuple().
        return timedelta(0)
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
...@@ -24,6 +37,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -24,6 +37,7 @@ class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None) year = getattr(self, 'year', None)
month = getattr(self, 'month', None) month = getattr(self, 'month', None)
day = getattr(self, 'day', None) day = getattr(self, 'day', None)
...@@ -77,7 +91,7 @@ class QuotesSpider(scrapy.Spider): ...@@ -77,7 +91,7 @@ class QuotesSpider(scrapy.Spider):
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first() d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',','').split(' ') d = d.replace(',','').split(' ')
item['date'] = datetime.date(int(d[2]), self.date_parser[d[0].lower()], int(d[1])) item['date'] = datetime(int(d[2]), self.date_parser[d[0].lower()], int(d[1]), tzinfo=self.tz).isoformat('T')
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1] item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first() item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment