Commit 15cbb498 authored by Renán Sosa Guillen

crawlers

parent 687a5463
 # -*- coding: utf-8 -*-
-import scrapy, re
-from diarioCoLatino.items import NoticiasItem
 """
-MEDIO:
-Diario Co Latino, El Salvador
-USO:
-scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
+MEDIA:
+Diario Co Latino, El Salvador
+USAGE:
+## Get all the news from a specific date. ##
+---------------------------------------------------------------------------------------------
+$ cd diarioCoLatino/
+$ scrapy crawl noticias --nolog -s filename=2018-02-23.json -a year=2018 -a month=2 -a day=23
 """
+import scrapy, re
+from diarioCoLatino.items import NoticiasItem

 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
     return TAG_RE.sub('', text)
@@ -20,9 +26,14 @@ LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
 EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
     name = "noticias"

     def start_requests(self):
         year = getattr(self, "year", None)
         month = getattr(self, "month", None)
@@ -33,6 +44,7 @@ class QuotesSpider(scrapy.Spider):
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
         yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -45,20 +57,22 @@ class QuotesSpider(scrapy.Spider):
             yield scrapy.Request(url=self.baseURL + "/page/" + str(page + 1), callback=self.parse_page)

     def parse_page(self, response):
-        for link in response.xpath('//div[@class="post-listing"]/article/h2/a/@href').extract():
+        for link in response.css('div.content').css('div.post-listing').xpath('./article/h2/a/@href').extract():
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''

-        "La fecha obtenida ya incluye formato y zona horaria"
-        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
-        item['title'] = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()
+        # The date obtained already includes format and time zone
+        news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        news_title = remove_tags(response.css('h1.entry-title').css('span').extract_first()).strip()

-        item['topic'] = None
+        news_topic = None

         for p in response.xpath('//div[@class="entry"]/p').extract():
             text += remove_tags(p) + "\n"
@@ -69,27 +83,30 @@ class QuotesSpider(scrapy.Spider):
             text = "\n" + text

         """ Get author """
+        news_author = None
         res = AUTH_RE.match(text)
         if res:
             m = res.group(0)
-            item['author'] = m[m.find('Por')+len('Por'):].strip()
+            news_author = m[m.find('Por')+len('Por'):].strip()
             text = text[text.find(m) + len(m):].strip()
             text = "\n" + text

         """ Remove twitter """
+        news_twitter = None
         res = TW_RE.search(text)
         if res:
             m = res.group(0)
-            item['twitter'] = m.strip()
+            news_twitter = m.strip()
             text = text[text.find(m) + len(m):].strip()
             text = "\n" + text

         """ Get location """
+        news_loc = None
         res = LOC_RE.match(text)
         if res:
             m = res.group(0)
             if m[m.find('/') + 1:].strip().lower() != 'dpa':
-                item['location'] = m[:m.find('/')].strip()
+                news_loc = m[:m.find('/')].strip()
                 text = text[text.find(m) + len(m):].strip()
                 text = "\n" + text
             else:
@@ -97,10 +114,11 @@ class QuotesSpider(scrapy.Spider):
                 text = "\n" + text

         """ Remove email """
+        news_email = None
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
             # text = text[text.find(m) + len(m):].strip()
             text = text.replace(m, '').strip()
             text = "\n" + text
@@ -108,7 +126,7 @@ class QuotesSpider(scrapy.Spider):
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
-            item['email'] = m.strip()
+            news_email = m.strip()
             # text = text[text.find(m) + len(m):].strip()
             text = text.replace(m, '').strip()
             text = "\n" + text
@@ -119,8 +137,16 @@ class QuotesSpider(scrapy.Spider):
             text = "\n" + text

         text = text.replace("\nCo Latino\n", '').strip()

-        item['text'] = text.strip()
-        item['url'] = response.url
+        ## News item info ##
+        item['date'] = news_date
+        item['title'] = news_title
+        item['topic'] = news_topic
+        item['author'] = news_author
+        item['twitter'] = news_twitter
+        item['location'] = news_loc
+        item['email'] = news_email
+        item['text'] = text.strip()
+        item['url'] = response.url
         yield item
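
For reference, parse_item above extracts metadata by repeatedly matching a regex against the head of the body text and slicing the match off. Below is a minimal standalone sketch of that pass (Python 3) with a hypothetical strip_metadata helper. LOC_RE and EM_RE are copied from the diff; AUTH_RE and TW_RE are defined in a collapsed hunk, so the patterns used here are illustrative stand-ins, and the 'dpa' special case and second email pass are omitted.

# Standalone sketch of the metadata-stripping pass in parse_item above.
import re

AUTH_RE = re.compile(r'\nPor.*?\n', re.I)   # assumed byline pattern, not the committed one
TW_RE = re.compile(r'\n@\w+\n')             # assumed twitter-handle pattern, not the committed one
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

def strip_metadata(text):
    """Peel author, twitter, location and email off the body text,
    in the same order parse_item applies them."""
    fields = {'author': None, 'twitter': None, 'location': None, 'email': None}
    text = "\n" + text

    res = AUTH_RE.match(text)                # byline is anchored at the very top
    if res:
        m = res.group(0)
        fields['author'] = m[m.find('Por') + len('Por'):].strip()
        text = "\n" + text[text.find(m) + len(m):].strip()

    res = TW_RE.search(text)
    if res:
        m = res.group(0)
        fields['twitter'] = m.strip()
        text = "\n" + text[text.find(m) + len(m):].strip()

    res = LOC_RE.match(text)                 # "City/AGENCY" dateline
    if res:
        m = res.group(0)
        fields['location'] = m[:m.find('/')].strip()
        text = "\n" + text[text.find(m) + len(m):].strip()

    res = EM_RE.search(text)                 # the email line can sit anywhere
    if res:
        m = res.group(0)
        fields['email'] = m.strip()
        text = "\n" + text.replace(m, '').strip()

    return fields, text.strip()

print(strip_metadata("Por Juan Perez\nSan Salvador/PL\nCuerpo de la nota...\n"))
# -> ({'author': 'Juan Perez', 'twitter': None, 'location': 'San Salvador', 'email': None},
#     'Cuerpo de la nota...')

The second spider touched by this commit, for El Financiero's RSS feed, follows.
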
 # -*- coding: utf-8 -*-
+"""
+MEDIA:
+El Financiero, CDMX
+USAGE:
+## Get the news from RSS. ##
+---------------------------------------------------------------------------------------------
+$ cd elFinanciero/
+$ scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""
 import scrapy, re, json
 from elFinanciero.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo
-"""
-MEDIO:
-El Financiero, CDMX
-USO:
-scrapy crawl noticias --nolog -s filename=2017-12-20.json
-"""

 TAG_RE = re.compile(r'<[^>]+>')

 def remove_tags(text):
@@ -16,65 +22,73 @@ def remove_tags(text):

 class UTC(tzinfo):
-    """clase para el 'time zone' (zona horaria)"""
+    """
+    Class for Time Zone
+    """

     def utcoffset(self, dt):
-        # zona horaria para hidalgo (centro de mexico): utc-6
+        ## Time zone for CDMX: UTC-6 ##
         return timedelta(hours=-6)

     def tzname(self, dt):
-        # nombre de la zona horaria
+        ## Time zone name ##
         return 'UTC-6'
 class QuotesSpider(scrapy.Spider):
+    """
+    Basic Scrapy Spider class
+    """
     name = "noticias"

     def start_requests(self):
         self.tz = UTC()
         # self.date_parser = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4,
         #                     'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
         #                     'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
         self.baseURL = "http://www.elfinanciero.com.mx/rss"
         yield scrapy.Request(url=self.baseURL, callback=self.parse)

     def parse(self, response):
         for link in response.xpath('//link/text()').extract()[1:]:
             yield scrapy.Request(url=link, callback=self.parse_item)

     def parse_item(self, response):
         item = NoticiasItem()
         text = ''

-        res = remove_tags(response.xpath('//script[@type="application/ld+json"]').extract_first())
+        res = response.xpath('//script[@data-schema="NewsArticle"]').extract_first()
+        if res is not None: res = remove_tags(res)
         resDict = json.loads(res)
         dt = resDict['datePublished']
         d, t = dt.split()
         d = map(int, d.split("-"))
         t = map(int, t.split(":"))
-        item['date'] = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")
+        news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=self.tz).isoformat("T")

-        item['title'] = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()
+        title = remove_tags(response.css('div.column').css('div.column').css('h1').extract_first()).strip()

         topic = response.xpath('//div[@class="section-line"]').extract_first()
         if topic is not None:
-            item['topic'] = remove_tags(topic)
-        else:
-            item['topic'] = None
+            topic = remove_tags(topic)

         author = response.xpath('//div[@class="note-author"]/a').extract_first()
         if author is not None:
-            item['author'] = remove_tags(author)
+            author = remove_tags(author)

         for p in response.css('div.content').css('p').extract():
             text += remove_tags(p) + '\n'

-        item['text'] = text.strip()
-        item['url'] = response.url
-        # print item['title']
+        ## News item info ##
+        item['date'] = news_date
+        item['title'] = title
+        item['topic'] = topic
+        item['author'] = author
+        item['text'] = text.strip()
+        item['url'] = response.url
         yield item
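
One portability note on the parse_item above: d = map(int, d.split("-")) is later indexed as d[0], which works on Python 2, where map() returns a list, but raises a TypeError on Python 3, where it returns an iterator. A minimal sketch of the same datePublished handling that runs on both, reusing the UTC class from this diff (the sample payload value is made up):

import json
from datetime import datetime, timedelta, tzinfo

class UTC(tzinfo):
    """Fixed UTC-6 offset for CDMX, as defined in the diff above."""
    def utcoffset(self, dt):
        return timedelta(hours=-6)
    def tzname(self, dt):
        return 'UTC-6'

raw = '{"datePublished": "2017-12-20 09:15:00"}'   # made-up sample of the page's JSON payload
dt = json.loads(raw)['datePublished']
d, t = dt.split()
d = [int(x) for x in d.split("-")]   # list comprehension: indexable on Python 2 and 3
t = [int(x) for x in t.split(":")]
news_date = datetime(d[0], d[1], d[2], t[0], t[1], t[2], tzinfo=UTC()).isoformat("T")
print(news_date)                     # -> 2017-12-20T09:15:00-06:00
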