crawlers

605abb9c · Renán Sosa Guillen · d3103297 · 605abb9c · 605abb9c · 605abb9c
Commit 605abb9c authored Mar 05, 2018 by Renán Sosa Guillen
7 changed files
--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/__init__.pyc
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/__init__.pyc
--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/items.pyc
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/items.pyc
--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/pipelines.pyc
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/pipelines.pyc
--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/settings.pyc
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/settings.pyc
--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/spiders/__init__.pyc
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/spiders/__init__.pyc
--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/spiders/noticias.py
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/spiders/noticias.py
@@ -13,13 +13,10 @@ TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
    return TAG_RE.sub('', text)
-# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
-# G_RE = re.compile(r' ?- ?')
+DATE_RE1 = re.compile(r'(-|- )?([A-Z][a-z]+, ?)?[A-Z][a-z]+( \d{1,2})?, \d{4}( -|\n)? ?[A-Z]')
-# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
+DATE_RE2 = re.compile(r', [a-zA-Z]+ \d{1,2} -( -)?')
-# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
+AUTH_RE = re.compile(r'\n(- )?By.+\n')
-# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
-# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
-# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
 class importantData(scrapy.Item):
@@ -90,19 +87,30 @@ class QuotesSpider(scrapy.Spider):
        for p in response.xpath('//div[@class="entry"]').css('p').extract():
            text += remove_tags(p) + "\n"
-        # result = LOC_RE.search(text)
+        text = text.strip()
-        # if result:
+        text = "\n" + text
-        #     m = result.group(0)
+        text = text.replace(u'\u2013', "-")
-        #     location = G_RE.sub('', m).strip()
+        text = text.replace(u'\u00a0', '')  ## Elimina 'no-break spaces'
-        #     if len(location) <= 35:
-        #         item['location'] = location
+        res = DATE_RE1.search(text)
-        #         text = text[text.find(m)+len(m):]
+        if res:
+            m = res.group(0)[:-1]
-        # text = EM_RE.sub('', text)
+            text = text[text.find(m) + len(m):].strip()
-        # text = TW_RE.sub('', text)
+            text = "\n" + text
-        # text = TW2_RE.sub('', text)
-        # text = TAG2_RE.sub("\n", text)
+        res = DATE_RE2.search(text)
-        # text = TAG3_RE.sub('', text)
+        if res:
+            m = res.group(0)[:-1]
+            text = text[text.find(m) + len(m):].strip()
+            text = "\n" + text
+        res = AUTH_RE.match(text)
+        if res:
+            m = res.group(0)
+            text = text[text.find(m) + len(m):].strip()
+            text = "\n" + text
+        text = text.replace("Follow The San Pedro Sun News on Twitter, become a fan on Facebook. Stay updated via RSS", '')
        item['text'] = text.strip()
        item['url'] = response.url

--- a/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/spiders/noticias.pyc
+++ b/descarga_por_dia/foraneos/sanPedroSun/sanPedroSun/spiders/noticias.pyc