Commit afe0647f authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

actualizadas fecha + zona horaria

parent 8b71564d
import scrapy
from datetime import datetime, date
from datetime import datetime, date, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=03 -a day=22
......@@ -9,6 +9,19 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return *text* with every HTML/XML tag (anything matching ``<...>``) removed."""
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Yucatan (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight-saving adjustment is modelled. tzinfo subclasses must
        # implement dst(); without it datetime.dst()/astimezone() raise
        # NotImplementedError.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -21,7 +34,9 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
self.year = getattr(self, 'year', None)
self.month = getattr(self, 'month', None)
self.day = getattr(self, 'day', None)
......@@ -60,12 +75,19 @@ class QuotesSpider(scrapy.Spider):
for post in response.xpath('//a[@class="lista_noticia"]'):
date = post.css('div.pad5').css('p.sec_autor2::text').extract_first()
news_date = datetime.strptime(date[21:31], '%Y-%m-%d').date()
# if ( self.year+'-'+self.month+'-'+self.day == date[21:31] ):
if ( news_date == self.date ):
item['date'] = news_date
date = date[21:]
news_date, news_time = date.split(' ')
news_date = map(int, news_date.split('-'))
news_time = map(int, news_time.split(':'))
item['date'] = datetime(news_date[0],news_date[1],news_date[2],news_time[0],news_time[1],news_time[2],tzinfo=self.tz).isoformat('T')
# item['date'] = news_date
item['topic'] = response.xpath('//div[@id="color_seccion"]/h1/text()').extract_first()
request = scrapy.Request(url=self.baseURL+post.css('::attr(href)').extract_first(), callback=self.parse_item)
request.meta['item'] = item
yield request
......
No preview for this file type
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# Matches any markup tag: "<" followed by anything up to the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag stripped out."""
    # The original carried a duplicated (unreachable) return line — diff residue.
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Yucatan (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    # NOTE(review): the original declared every field twice (diff residue);
    # the second scrapy.Field() assignment merely rebinds the same class
    # attribute, so the duplicates are dropped.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
    """Spider for desdeelbalcon.com: crawls the archive of one given date.

    Usage: scrapy crawl noticias ... -a year=2017 -a month=3 -a day=22

    NOTE(review): the scraped diff showed both pre- and post-change lines
    (duplicate request yields, two conflicting item['date'] assignments);
    this is the reconstructed post-commit version.
    """
    name = "noticias"

    def start_requests(self):
        # Time zone for Yucatan (central Mexico): UTC-6.
        self.tz = UTC()
        self.year = getattr(self, 'year', None)
        self.month = getattr(self, 'month', None)
        self.day = getattr(self, 'day', None)
        self.baseURL = 'http://www.desdeelbalcon.com/' + self.year + '/' + self.month + '/' + self.day
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        # Discover how many archive pages exist and request each one.
        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            # Last pagination link ends in the total page count.
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/') + 1:])
            for page in range(0, pages):
                if page == 0:
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        # One item per article link; the article body is filled in parse_item.
        item = NoticiasItem()
        for post in response.xpath('//ul[@class="archivepost"]/li'):
            # Date as an ISO 8601 string carrying the UTC-6 offset.
            item['date'] = datetime(int(self.year), int(self.month), int(self.day), tzinfo=self.tz).isoformat('T')
            item['topic'] = post.xpath('./p/a/text()').extract()
            request = scrapy.Request(url=post.xpath('./h2/a/@href').extract_first(), callback=self.parse_item)
            request.meta['item'] = item
            yield request

    def parse_item(self, response):
        # Fill in title, body text and URL for the article page.
        text = ''
        item = response.meta['item']
        item['title'] = response.xpath('//h1[@class="post entry-title"]/a/text()').extract_first()
        for paragraph in response.xpath('//div[@itemprop="text"]/p').extract():
            text += remove_tags(paragraph) + '\n'
        item['text'] = text
        item['url'] = response.url
        yield item
No preview for this file type
import scrapy
from datetime import date
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import scrapy, re
from datetime import datetime, timedelta, tzinfo
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
# Matches any markup tag: "<" followed by anything up to the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag stripped out."""
    # The original carried a duplicated (unreachable) return line — diff residue.
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Sonora (Pacific Mexico): UTC-7."""

    def utcoffset(self, dt):
        # Sonora is seven hours behind UTC.
        return timedelta(hours=-7)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-7'
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    # NOTE(review): duplicated field declarations (diff residue) removed;
    # a second scrapy.Field() assignment only rebinds the same attribute.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Build the archive URL for the requested date and start the crawl.

    NOTE(review): the scraped diff showed two conflicting self.date
    assignments and a duplicated yield; this is the reconstructed
    post-commit version.
    """
    # Time zone for Sonora (Pacific Mexico): UTC-7.
    tz = UTC()
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
    # Article date as an ISO 8601 string carrying the UTC-7 offset.
    self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
    self.baseURL = 'http://diariodelyaqui.mx/' + year + '/' + month + '/' + day
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
    """Walk the archive pagination and hand every page to parse_page."""
    page_links = response.xpath('//ul[@class="page-numbers"]/li/a/@href').extract()
    if not page_links:
        # Single page of results: scrape the current response directly.
        yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        return
    # The second-to-last pagination link ends in the total page count.
    last_link = page_links[-2].strip('/')
    total_pages = int(last_link[last_link.rfind('/') + 1:])
    for page_number in range(1, total_pages + 1):
        if page_number == 1:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
        else:
            yield scrapy.Request(url=self.baseURL + '/page/' + str(page_number), callback=self.parse_page)
......@@ -60,10 +74,13 @@ class QuotesSpider(scrapy.Spider):
item['date'] = self.date
item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
item['topic'] = response.xpath('//ul[@class="post-categories"]/li/a/text()').extract()
for paragraph in response.xpath('//div[@class="clearfix"]/p').extract():
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
No preview for this file type
from datetime import date
import scrapy
from datetime import date, datetime, timedelta, tzinfo, time
import scrapy, re
# scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Pre-compiled pattern matching a single markup tag ("<...>").
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* stripped of all HTML/XML tags."""
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for central Mexico: UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -23,7 +34,9 @@ class NoticiasItem(scrapy.Item):
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -61,7 +74,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list:
item = NoticiasItem()
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html':
......@@ -93,7 +107,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list:
item = NoticiasItem()
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.html' or s == 'correo.html' or s == 'edito.php' or s == 'correo.php':
......@@ -117,7 +132,8 @@ class QuotesSpider(scrapy.Spider):
for s in section_list:
item = NoticiasItem()
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['topic'] = parse_s[s]
if s == 'edito.php' or s == 'correo.php':
......@@ -385,7 +401,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item_3(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
title = response.xpath('//*[@class="documentContent"]/h1[@class="title"]/text()').extract()
if ( len(title) > 0 ):
......@@ -410,7 +427,8 @@ class QuotesSpider(scrapy.Spider):
# path_list = ['//*[@class="col"]/p', '//*[@class="col col1"]/p', '//*[@class="col col2"]/p']
path_list = ['//*[@class="col"]', '//*[@class="col col1"]', '//*[@class="col col2"]']
item['date'] = self.date
# item['date'] = self.date
item['date'] = datetime.combine(self.date, time()).replace(tzinfo=self.tz).isoformat('T')
item['title'] = remove_tags(response.xpath('//*[@class="cabeza"]').extract_first())
item['topic'] = response.xpath('//*[@class="breadcrumb gui"]/span[2]/a/text()').extract_first()
......
No preview for this file type
import scrapy, re
from datetime import datetime
from datetime import datetime, timedelta, tzinfo
"""
Esta version se encarga de la descarga de la nueva pagina de La Jornada Guerrero
......@@ -16,6 +16,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Guerrero (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -29,6 +41,7 @@ class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -76,7 +89,9 @@ class QuotesSpider(scrapy.Spider):
d = d.replace(',','')
m = d[:d.find(' ')]
d = d.replace(m, self.parse_month[m])
item['date'] = datetime.strptime(d, '%m %d %Y').date()
# item['date'] = datetime.strptime(d, '%m %d %Y').date()
d = map(int, d.split(' '))
item['date'] = datetime(d[2],d[0],d[1],tzinfo=self.tz).isoformat('T')
title = response.xpath('//*[@class="itemHeader"]/h2/text()').extract_first()
if title is not None:
......@@ -92,6 +107,7 @@ class QuotesSpider(scrapy.Spider):
item['text'] = text
item['url'] = response.url
# print item['url']
yield item
No preview for this file type
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches any markup tag: "<" followed by anything up to the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag stripped out."""
    # The original carried a duplicated (unreachable) return line — diff residue.
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Veracruz (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    # NOTE(review): duplicated field declarations (diff residue) removed;
    # a second scrapy.Field() assignment only rebinds the same attribute.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Build the archive URL for the requested date and start the crawl.

    NOTE(review): the scraped diff interleaved the old self.year/self.month/
    self.day version with the new tz-aware one and showed a duplicated
    request; this is the reconstructed post-commit version.
    """
    # Time zone for Veracruz (central Mexico): UTC-6.
    tz = UTC()
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
    # Article date as an ISO 8601 string carrying the UTC-6 offset.
    self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
    self.baseURL = 'http://www.jornadaveracruz.com.mx/'
    self.builtURL = self.baseURL + 'Archive.aspx?date=' + day.zfill(2) + '/' + month.zfill(2) + '/' + year
    yield scrapy.Request(url=self.builtURL, callback=self.parse)
def parse(self, response):
......@@ -59,10 +71,22 @@ class QuotesSpider(scrapy.Spider):
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['title'] = response.xpath('//h2[@class="article-title"]/text()').extract_first()
item['topic'] = response.xpath('//*[@class="content-article-title"]/h2/text()').extract()
item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
item['date'] = self.date
title = response.xpath('//h2[@class="article-title"]/text()').extract_first()
title = title.replace('\r','')
title = title.replace('\n','')
title = title.lstrip(' ')
title = title.rstrip(' ')
item['title'] = title
topic = response.xpath('//*[@class="content-article-title"]/h2/text()').extract_first()
topic = topic.replace('\r','')
topic = topic.replace('\n','')
topic = topic.lstrip(' ')
topic = topic.rstrip(' ')
item['topic'] = topic
# item['author'] = response.xpath('//*[@class="right-side"]/div/a[@rel="author"]/text()').extract_first()
paragraph = response.xpath('//*[@class="shortcode-content"]/p/text()').extract()
if ( len(paragraph) > 0 ):
......
No preview for this file type
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
import re
## scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=30
# Matches any markup tag: "<" followed by anything up to the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag stripped out."""
    # The original carried a duplicated (unreachable) return line — diff residue.
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Yucatan (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    # NOTE(review): duplicated field declarations (diff residue) removed;
    # a second scrapy.Field() assignment only rebinds the same attribute.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Build the archive URL for the requested date and start the crawl.

    NOTE(review): the scraped diff interleaved the old self.year/self.month/
    self.day version with the new tz-aware one and showed a duplicated
    request; this is the reconstructed post-commit version.
    """
    # Time zone for Yucatan (central Mexico): UTC-6.
    tz = UTC()
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
    # Article date as an ISO 8601 string carrying the UTC-6 offset.
    self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
    self.baseURL = 'http://lectormx.com/' + year + '/' + month + '/' + day
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.css('div.pagination').xpath('./ul/li/a/@href').extract()
......@@ -43,14 +59,16 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Queue a request for every article linked from this archive page."""
    # The original showed the yield line twice (diff residue); one is kept.
    for link in response.xpath('//h2[@class="title"]/a/@href').extract():
        yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
text = ''
item = NoticiasItem()
item['date'] = self.year+'-'+self.month.zfill(2)+'-'+self.day.zfill(2)
item['date'] = self.date
item['title'] = response.xpath('//div[@class="single_post"]/header/h1/text()').extract_first()
item['topic'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[6]/a/text()').extract_first()
item['author'] = response.xpath('//div[@class="single_post"]/header/div[1]/div[2]/a/text()').extract_first()
......
No preview for this file type
import scrapy
import scrapy, re
from datetime import datetime, timedelta, tzinfo
#scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=3 -a day=22
import re
# Matches any markup tag: "<" followed by anything up to the next ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return *text* with every HTML/XML tag stripped out."""
    # The original carried a duplicated (unreachable) return line — diff residue.
    return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Yucatan (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
    """Container for one scraped news article."""
    # NOTE(review): duplicated field declarations (diff residue) removed;
    # a second scrapy.Field() assignment only rebinds the same attribute.
    title = scrapy.Field()
    text = scrapy.Field()
    date = scrapy.Field()
    location = scrapy.Field()
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
    """Build the archive URL for the requested date and start the crawl.

    NOTE(review): the scraped diff interleaved the old self.year/self.month/
    self.day version with the new tz-aware one and showed a duplicated
    request; this is the reconstructed post-commit version.
    """
    # Time zone for this site (central Mexico): UTC-6.
    tz = UTC()
    year = getattr(self, 'year', None)
    month = getattr(self, 'month', None)
    day = getattr(self, 'day', None)
    # Article date as an ISO 8601 string carrying the UTC-6 offset.
    self.date = datetime(int(year), int(month), int(day), tzinfo=tz).isoformat('T')
    self.baseURL = 'http://gruporivas.com.mx/notirivas/' + year + '/' + month + '/' + day
    yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
pagination = response.xpath('//*[@class="bdaia-pagination"]/span[@class="pages"]/text()').extract()
......@@ -43,15 +59,17 @@ class QuotesSpider(scrapy.Spider):
else:
yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
def parse_page(self, response):
    """Queue a request for every article linked from this archive page."""
    # The original showed the yield line twice (diff residue); one is kept.
    for link in response.xpath('//article/header/h2/a/@href').extract():
        yield scrapy.Request(url=link, callback=self.parse_item)
def parse_item(self, response):
item = NoticiasItem()
text = ''
item['date'] = self.date
item['title'] = response.xpath('//*[@class="bdaia-post-title"]/h1/span/text()').extract_first()
item['date'] = self.year+'-'+self.month+'-'+self.day
item['topic'] = response.xpath('//*[@class="bdaia-category"]/a/text()').extract_first()
content = response.xpath('//*[@class="bdaia-post-content"]/p/text()').extract()
......@@ -64,6 +82,7 @@ class QuotesSpider(scrapy.Spider):
text += remove_tags(paragraph) + '\n'
item['text'] = text
item['url'] = response.url
# print item['title']
yield item
No preview for this file type
import scrapy, re, datetime
import scrapy, re
from datetime import datetime, timedelta, tzinfo
'''
scrapy crawl noticias -t json --nolog -o noticias.json -a year=2017 -a month=9 -a day=22
......@@ -10,6 +11,18 @@ def remove_tags(text):
return TAG_RE.sub('', text)
class UTC(tzinfo):
    """Fixed-offset time zone for Hidalgo (central Mexico): UTC-6."""

    def utcoffset(self, dt):
        # Central Mexico is six hours behind UTC.
        return timedelta(hours=-6)

    def dst(self, dt):
        # No daylight saving modelled; required so dst()/astimezone() work.
        return timedelta(0)

    def tzname(self, dt):
        # Human-readable name of the zone.
        return 'UTC-6'
class NoticiasItem(scrapy.Item):
title = scrapy.Field()
text = scrapy.Field()
......@@ -24,6 +37,7 @@ class QuotesSpider(scrapy.Spider):
name = "noticias"
def start_requests(self):
self.tz = UTC()
year = getattr(self, 'year', None)
month = getattr(self, 'month', None)
day = getattr(self, 'day', None)
......@@ -77,7 +91,7 @@ class QuotesSpider(scrapy.Spider):
d = response.xpath('//p[@class="post-meta"]/span/text()').extract_first()
d = d.replace(',','').split(' ')
item['date'] = datetime.date(int(d[2]), self.date_parser[d[0].lower()], int(d[1]))
item['date'] = datetime(int(d[2]), self.date_parser[d[0].lower()], int(d[1]), tzinfo=self.tz).isoformat('T')
item['topic'] = response.xpath('//span[@typeof="v:Breadcrumb"]/a/text()').extract()[1]
item['title'] = response.xpath('//*[@class="post-inner"]/h1/span/text()').extract_first()
......
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment