noticiero en línea

067f09e9 · Mario Chirinos Colunga · 57f362cd · 067f09e9
Commit 067f09e9 authored Apr 15, 2020 by Mario Chirinos Colunga 💬
Hide whitespace changes
Inline Side-by-side

Showing with 9 additions and 5 deletions

noticias.py ...por_dia/noticieroLinea/noticieroLinea/spiders/noticias.py +9 -5

No files found.
--- a/descarga_por_dia/noticieroLinea/noticieroLinea/spiders/noticias.py
+++ b/descarga_por_dia/noticieroLinea/noticieroLinea/spiders/noticias.py
@@ -8,15 +8,18 @@ Noticiero en Línea, Colima
 USO:
 scrapy crawl noticias --nolog -s filename=2018-01-03.json -a year=2018 -a month=1 -a day=3
 """
-
+from dateparser import parse
 TAG_RE = re.compile(r'<[^>]+>')
-def remove_tags(text):
-    return TAG_RE.sub('', text)
-
 LOC_RE = re.compile(r'\A.+?(\d{1,2}[\s-][a-zA-Z]+[\s-]\d{4})?\s?\.\s?-\s?', re.S)
 G_RE = re.compile(r'\s?-\s?')
 E_RE = re.compile(r'\(Con informaci.*?\)\.?')

+#-------------------------------------------------------------------------------
+def remove_tags(text):
+    return TAG_RE.sub('', text)
+#-------------------------------------------------------------------------------
+
+
 class QuotesSpider(scrapy.Spider):
    name = "noticias"

@@ -44,7 +47,8 @@ class QuotesSpider(scrapy.Spider):
        text = ''

        "La fecha obtenida ya incluye formato y zona horaria"
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        date = response.css("span.entry-meta-date::text").get().lower()
+        item['date'] = parse(date, date_formats=["%d %B, %Y"]).isoformat()#response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['title'] = remove_tags(response.xpath('//h1[@class="entry-title"]').extract_first()).strip()

        item['topic'] = response.xpath('//*[@class="entry-tags clearfix"]/a/text()').extract_first()