Commit 067f09e9 authored by Mario Chirinos Colunga

noticiero en linea

parent 57f362cd
......@@ -8,15 +8,18 @@ Noticiero en Línea, Colima
USO:
scrapy crawl noticias --nolog -s filename=2018-01-03.json -a year=2018 -a month=1 -a day=3
"""
from dateparser import parse
# Matches one markup tag: "<", one or more characters other than ">", then ">".
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every markup tag removed.

    Each substring matched by ``TAG_RE`` is deleted; the text between
    tags is kept unchanged. A lone "<" with no closing ">" is not a
    match and is left in place.

    Args:
        text: String that may contain markup tags.

    Returns:
        A copy of ``text`` without the matched tags.
    """
    return TAG_RE.sub('', text)
# Leading prefix of a text: anchored at the very start (\A), as few characters
# as possible, then an optional "<1-2 digits> <letters> <4 digits>" date whose
# parts are separated by whitespace or a hyphen, up to and including a ".-"
# separator. DOTALL lets the lazy ".+?" run across line breaks.
# NOTE(review): presumably strips the "place, date.-" dateline that opens an
# article body -- confirm against where this pattern is used.
LOC_RE = re.compile(
    r'\A.+?(\d{1,2}[\s-][a-zA-Z]+[\s-]\d{4})?\s?\.\s?-\s?',
    flags=re.DOTALL,
)
# A hyphen with at most one whitespace character on each side.
G_RE = re.compile(r'\s?-\s?')
# A parenthesised note starting with "(Con informaci", matched lazily up to the
# first ")", plus an optional trailing period.
E_RE = re.compile(r'\(Con informaci.*?\)\.?')
#-------------------------------------------------------------------------------
def remove_tags(text):
    """Return ``text`` with every match of ``TAG_RE`` removed.

    Args:
        text: String to clean.

    Returns:
        A copy of ``text`` in which each substring matched by the
        module-level ``TAG_RE`` pattern is replaced by the empty string.
    """
    return TAG_RE.sub('', text)
#-------------------------------------------------------------------------------
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -44,7 +47,8 @@ class QuotesSpider(scrapy.Spider):
text = ''
"La fecha obtenida ya incluye formato y zona horaria"
item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
date = response.css("span.entry-meta-date::text").get().lower()
item['date'] = parse(date, date_formats=["%d %B, %Y"]).isoformat()#response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
item['title'] = remove_tags(response.xpath('//h1[@class="entry-title"]').extract_first()).strip()
item['topic'] = response.xpath('//*[@class="entry-tags clearfix"]/a/text()').extract_first()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment