crawlers

f6e99c05 · Renán Sosa Guillen · 32d4ecab · f6e99c05 · f6e99c05 · f6e99c05
Commit f6e99c05 authored Mar 02, 2018 by Renán Sosa Guillen
4 changed files
--- a/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/items.py
+++ b/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/items.py
@@ -18,3 +18,5 @@ class NoticiasItem(scrapy.Item):
    author = scrapy.Field()
    topic = scrapy.Field()
    url = scrapy.Field()
+    twitter = scrapy.Field()
+    email = scrapy.Field()
--- a/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/items.pyc
+++ b/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/items.pyc
--- a/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/spiders/noticias.py
+++ b/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/spiders/noticias.py
@@ -13,14 +13,18 @@ TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
    """Return ``text`` with every substring matched by TAG_RE deleted."""
    cleaned = TAG_RE.sub('', text)
    return cleaned

+
 # LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
 # G_RE = re.compile(r' ?- ?')
-# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
 # TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
-# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
 # TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
 # TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
-
+# Byline: a newline, the literal "Por", then the rest of that line up to and
+# including the next newline.
+AUTH_RE = re.compile(r'\nPor.+?\n')
+# An "@handle" plus the single character that follows it, optionally preceded
+# by a newline, an optional "| " and a "Twitter:" label (case-insensitive).
+# NOTE(review): the handle part can also match the "@domain" portion of an
+# e-mail address - confirm that is acceptable.
+TW_RE = re.compile(r'(\n(\| )?Twitter:\s+)?(@[\w.%+-]+.)', re.I)
+# From a newline through the first "/PL", "/AFP" or "/DPA" (case-insensitive).
+LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA)', re.I)
+# An e-mail address, optionally preceded by a newline and one of the labels
+# "Email:", "Correo electr...nico:" or "Comentario(s):"; ".{1,3}" stands in for
+# the accented letter.
+EM_RE = re.compile(r'(\n(Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
+# NOTE: the e-mail pattern is meant to be applied twice in a row.
+# Phrases to remove: '\nFotografías\n', '\nDiario Co Latino\n'
 class QuotesSpider(scrapy.Spider):
    name = "noticias"

@@ -68,6 +72,49 @@ class QuotesSpider(scrapy.Spider):
            for p in response.xpath('//div[@class="entry"]/div/span').extract():
                text += remove_tags(p) + "\n"

+        # Every pattern below anchors on a leading newline, so the remaining
+        # text is re-prefixed with "\n" after each trim.
+        text = "\n" + text
+
+        # Author: take the first "Por ..." line and discard everything up to
+        # and including it.
+        byline = AUTH_RE.search(text)
+        if byline:
+            # The match always starts with "\nPor"; what follows is the name.
+            item['author'] = byline.group(0)[len("\nPor"):].strip()
+            text = "\n" + text[byline.end():].strip()
+
+        # Twitter handle: store it and discard everything up to and including it.
+        handle = TW_RE.search(text)
+        if handle:
+            item['twitter'] = handle.group(0).strip()
+            text = "\n" + text[handle.end():].strip()
+
+        # Location: the part of the dateline before the first "/". A dateline
+        # whose remainder is "DPA" is left in the text untouched.
+        dateline = LOC_RE.search(text)
+        if dateline:
+            place, _, agency = dateline.group(0).partition('/')
+            if agency.strip().lower() != 'dpa':
+                item['location'] = place.strip()
+                text = "\n" + text[dateline.end():].strip()
+
+        # E-mail: the pattern is applied twice in a row; a second hit
+        # overwrites the first. Stop early when nothing matches, because a
+        # second search on unchanged text cannot match either.
+        for _ in range(2):
+            address = EM_RE.search(text)
+            if not address:
+                break
+            item['email'] = address.group(0).strip()
+            text = "\n" + text[address.end():].strip()
+
+        # Drop the "Diario Co Latino" credit line and whatever precedes it.
+        # The previous statement only indexed a single character and threw the
+        # result away, so it removed nothing and raised IndexError on short
+        # texts that lack the phrase.
+        # NOTE(review): slicing mirrors the steps above; confirm the phrase
+        # never appears after the article body.
+        credit = "\nDiario Co Latino\n"
+        credit_at = text.find(credit)
+        if credit_at != -1:
+            text = text[credit_at + len(credit):]
+
        # result = LOC_RE.search(text)
        # if result:
        #     m = result.group(0)

--- a/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/spiders/noticias.pyc
+++ b/descarga_por_dia/paisElSalvador/diarioCoLatino/diarioCoLatino/spiders/noticias.pyc