crawlers

808146d9 · Renán Sosa Guillen · ee680ed4 · 808146d9 · 808146d9 · 808146d9
Commit 808146d9 authored Mar 05, 2018 by Renán Sosa Guillen
4 changed files
--- a/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/spiders/noticias.py
+++ b/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/spiders/noticias.py
@@ -14,17 +14,12 @@ def remove_tags(text):
    return TAG_RE.sub('', text)


-# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
-# G_RE = re.compile(r' ?- ?')
-# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
-# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
-# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
 AUTH_RE = re.compile(r'\nPor.+?\n')
 TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
 LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
 EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
-#** correo pasarlo dos veces seguidas
-#Frases a quitar: '\nFotografías\n', '\nDiario Co Latino\n'
+
+
 class QuotesSpider(scrapy.Spider):
    name = "noticias"

@@ -124,19 +119,6 @@ class QuotesSpider(scrapy.Spider):
        text = "\n" + text
        text = text.replace("\nCo Latino\n", '').strip()

-        # result = LOC_RE.search(text)
-        # if result:
-        #     m = result.group(0)
-        #     location = G_RE.sub('', m).strip()
-        #     if len(location) <= 35:
-        #         item['location'] = location
-        #         text = text[text.find(m)+len(m):]
-
-        # text = EM_RE.sub('', text)
-        # text = TW_RE.sub('', text)
-        # text = TW2_RE.sub('', text)
-        # text = TAG2_RE.sub("\n", text)
-        # text = TAG3_RE.sub('', text)
        item['text'] = text.strip()

        item['url'] = response.url

--- a/descarga_por_dia/paisHonduras/tiempoDigitalHn/tiempoDigitalHn/settings.pyc
+++ b/descarga_por_dia/paisHonduras/tiempoDigitalHn/tiempoDigitalHn/settings.pyc
--- a/descarga_por_dia/paisHonduras/tiempoDigitalHn/tiempoDigitalHn/spiders/noticias.py
+++ b/descarga_por_dia/paisHonduras/tiempoDigitalHn/tiempoDigitalHn/spiders/noticias.py
@@ -13,13 +13,9 @@ TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
    return TAG_RE.sub('', text)

-# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
-# G_RE = re.compile(r' ?- ?')
-# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
-# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
-# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
-# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
-# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
+LOC_RE1 = re.compile(r'\n([A-Z]+ )+ ?[.-]')
+LOC_RE2 = re.compile(r'\n.+?,? ?.+? ?(\. ?-|\.|-) ?[A-Z]')
+SOURCE_RE = re.compile(r'\n ?Fuente:.+$')

 class QuotesSpider(scrapy.Spider):
    name = "noticias"
@@ -76,19 +72,35 @@ class QuotesSpider(scrapy.Spider):
        for p in response.xpath('//div[@class="td-post-content"]').css('p').extract():
            text += remove_tags(p) + "\n"

-        # result = LOC_RE.search(text)
-        # if result:
-        #     m = result.group(0)
-        #     location = G_RE.sub('', m).strip()
-        #     if len(location) <= 35:
-        #         item['location'] = location
-        #         text = text[text.find(m)+len(m):]
-
-        # text = EM_RE.sub('', text)
-        # text = TW_RE.sub('', text)
-        # text = TW2_RE.sub('', text)
-        # text = TAG2_RE.sub("\n", text)
-        # text = TAG3_RE.sub('', text)
+        text = text.strip()
+        text = "\n" + text
+        text = text.replace(u'\u2013', "-")
+        text = text.replace(u'\u00a0', '') ## Elimina 'no-break spaces'
+
+        res = LOC_RE1.match(text)
+        if res:
+            m = res.group(0)[:-1]
+            location = m.replace("-", '').strip()
+            if len(location) <= 25:
+                item['location'] = location
+                text = text.replace(m, '').strip()
+                text = "\n" + text
+
+        res = LOC_RE2.match(text)
+        if res:
+            m = res.group(0)[:-1]
+            location = m.replace("-", '').replace(".", '').strip()
+            if len(location) <= 25:
+                item['location'] = location
+                text = text.replace(m, '').strip()
+                text = "\n" + text
+
+        res = SOURCE_RE.search(text)
+        if res:
+            m = res.group(0)
+            text = text.replace(m, '').strip()
+            text = "\n" + text
+
        item['text'] = text.strip()

        item['url'] = response.url

--- a/descarga_por_dia/paisHonduras/tiempoDigitalHn/tiempoDigitalHn/spiders/noticias.pyc
+++ b/descarga_por_dia/paisHonduras/tiempoDigitalHn/tiempoDigitalHn/spiders/noticias.pyc