Commit 2a9d7bdc authored by Renán Sosa Guillen

crawlers: point the elSalvador spider at the "internacional" section and page it through the site's LoadMore endpoint

parent d60a1a0a
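At a glance, the diff repoints the spider from the "nacional" section to "internacional" and drops the old `?page=N` archive pagination in favor of POSTing the site's WordPress LoadMore endpoint, which answers with an escaped HTML fragment. A minimal sketch of the request this implies; the helper name is mine, and the field meanings are only inferred from the `frmdata` dicts in the diff (`pppDestacado`/`pppNoDestacado` look like featured and non-featured posts per page):

import scrapy

def load_more_request(slug, category_name, page, callback):
    # Hypothetical helper, not part of the commit: build the POST the
    # diff sends to the WordPress LoadMore endpoint for one result page.
    url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
    frmdata = {
        'pppDestacado': "5",        # inferred: featured posts per page
        'pppNoDestacado': "4",      # inferred: non-featured posts per page
        'slug': slug,
        'paged': str(page),
        'category_name': category_name,
        'url_peticion': "/category/noticias/" + slug + "/",
    }
    return scrapy.http.FormRequest(url=url, formdata=frmdata, callback=callback)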
# -*- coding: utf-8 -*-
import scrapy, re, json
import scrapy, re, json, ast
from scrapy.selector import Selector
from datetime import datetime, date
from elSalvador.items import NoticiasItem
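The new `ast` import exists to unescape that LoadMore response: the endpoint appears to return the rendered HTML as a single quoted, backslash-escaped string, so `ast.literal_eval` recovers the markup before it is handed to a `Selector`. A self-contained sketch on a made-up payload:

import ast
from scrapy.selector import Selector

# Stand-in for response.body from the LoadMore endpoint: HTML wrapped in
# quotes, with escaped quotes and JSON-style escaped slashes.
raw = '"<div class=\\"row news\\"><h2><a href=\\"http:\\/\\/example.com\\/a\\">t</a></h2></div>"'
unescaped = ast.literal_eval(raw.strip())      # peel off the outer quoting
body = Selector(text=unescaped)
links = [l.replace('\\', '')                   # drop the leftover backslashes
         for l in body.xpath('//h2/a/@href').extract()]
assert links == ['http://example.com/a']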
@@ -48,26 +49,26 @@ class QuotesSpider(scrapy.Spider):
baseURL = "http://www.elsalvador.com/category/noticias/"
# sectionList = []
sectionList = ["nacional"]
sectionList = ["internacional"]
# if self.stopDate is None:
# for s in sectionList:
# info = ImportantData()
# info['page'] = 1
# request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
# request.meta['item'] = info
# yield request
#
# else:
# for s in sectionList:
# info = ImportantData()
# info['page'] = 1
# info['CONTINUE_SEARCHING'] = False
# request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
# request.meta['item'] = info
# yield request
if self.stopDate is None:
for s in sectionList:
yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
info = ImportantData()
info['page'] = 1
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = info
yield request
else:
for s in sectionList:
info = ImportantData()
info['page'] = 0
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = info
yield request
# for s in sectionList:
# yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
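Note the asymmetry the rewrite introduces: both branches now thread an `ImportantData` item through `request.meta`, but the stop-date branch starts its counter at 0 instead of 1, because page 0 is the plain HTML section page while every later page comes back from the LoadMore endpoint. A stand-alone rendering of the branch, with a plain dict standing in for `ImportantData`:

import scrapy

def entry_requests(base_url, sections, stop_date, parse, parse_with_stop_date):
    # Illustrative helper (the name is mine): one request per section,
    # routed by whether a stop date was supplied.
    for s in sections:
        if stop_date is None:
            info = {'page': 1}
            callback = parse
        else:
            info = {'page': 0, 'CONTINUE_SEARCHING': False}
            callback = parse_with_stop_date
        request = scrapy.Request(url=base_url + s + "/", callback=callback)
        request.meta['item'] = info
        yield request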
def parse(self, response):
@@ -109,32 +110,42 @@ class QuotesSpider(scrapy.Spider):
# yield request
linkList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
linkList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
# for link in linkList:
# print link
url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
url_peticion = "/category/noticias/nacional/"
frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "nacional", 'paged': "1", 'category_name': "Nacional", 'url_peticion': url_peticion}
yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
for link in linkList:
print link
# url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
# url_peticion = "/category/noticias/internacional/"
# frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': "4526", 'category_name': "Internacional", 'url_peticion': url_peticion}
#
# yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
def after_post(self, response):
# from scrapy.shell import inspect_response
import ast
from scrapy.selector import Selector
print "This is response: "
unescaped = ast.literal_eval(response.body.strip())
body = Selector(text=unescaped)
# inspect_response(response, self)
newsList = []
for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
link = link.replace('\\', '')
if not link in newsList:
newsList.append(link)
for link in newsList:
print link
# def after_post(self, response):
# searchData = response.meta['item']
# # from scrapy.shell import inspect_response
# # print "This is response: "
# unescaped = ast.literal_eval(response.body.strip())
# body = Selector(text=unescaped)
# # inspect_response(response, self)
# newsList = []
# linksObtained = body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract()
# for link in linksObtained:
# link = link.replace('\\', '')
# if not link in newsList:
# newsList.append(link)
#
# # print len(newsList)  # check newsList's length to decide when to stop paging
# if len(newsList) > 0:
# for link in newsList:
# info = ImportantData()
# info['url'] = searchData['url']
# info['page'] = searchData['page']
# info['section_url'] = searchData['section_url']
# if link == linkList[-1]: info['LAST_LINK'] = True
# else: info['LAST_LINK'] = False
# reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
# reqst.meta['item'] = info
# yield reqst
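Both versions of `after_post` repeat the same cleanup: strip the escaped slashes from each href and de-duplicate while preserving document order. Extracted into a helper for clarity (illustrative, not part of the commit):

def unique_links(hrefs):
    # Order-preserving de-duplication plus the backslash cleanup the
    # diff applies to every extracted link.
    seen, out = set(), []
    for href in hrefs:
        href = href.replace('\\', '')
        if href not in seen:
            seen.add(href)
            out.append(href)
    return out

assert unique_links(['http:\\/\\/x.com\\/a', 'http:\\/\\/x.com\\/a']) == ['http://x.com/a']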
def parse_with_stop_date(self, response):
@@ -142,49 +153,89 @@ class QuotesSpider(scrapy.Spider):
CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
if searchData['page'] == 0:
searchData['section_url'] = response.url
linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
linkList.remove(searchData['section_url'])
newsList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
# newsList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
else:
linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
try:
linkList.remove(searchData['section_url'])
except ValueError:  # list.remove() raises ValueError, not KeyError
pass
unescaped = ast.literal_eval(response.body.strip())
body = Selector(text=unescaped)
newsList = []
for link in linkList:
for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
link = link.replace('\\', '')
if not link in newsList:
newsList.append(link)
if len(newsList) > 0:
for link in newsList:
info = ImportantData()
info['url'] = response.url
# info['url'] = response.url
info['page'] = searchData['page']
info['section_url'] = searchData['section_url']
if link == linkList[-1]: info['LAST_LINK'] = True
if link == newsList[-1]: info['LAST_LINK'] = True
else: info['LAST_LINK'] = False
reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
reqst.meta['item'] = info
yield reqst
# if searchData['page'] == 1:
# searchData['section_url'] = response.url
# linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
# linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
# linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
# linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
# linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
# linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
# linkList.remove(searchData['section_url'])
#
# else:
# linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
# linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
# try:
# linkList.remove(searchData['section_url'])
# except KeyError:
# pass
#
# newsList = []
# for link in linkList:
# if not link in newsList:
# newsList.append(link)
#
# for link in newsList:
# info = ImportantData()
# info['url'] = response.url
# info['page'] = searchData['page']
# info['section_url'] = searchData['section_url']
# if link == linkList[-1]: info['LAST_LINK'] = True
# else: info['LAST_LINK'] = False
# reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
# reqst.meta['item'] = info
# yield reqst
else:
searchData['CONTINUE_SEARCHING'] = False
searchData['page'] += 1
page = searchData['page']
url = searchData['section_url']
request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
page = str(searchData['page'])
url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
url_peticion = "/category/noticias/internacional/"
frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': page,
'category_name': "Internacional", 'url_peticion': url_peticion}
request = scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.parse_with_stop_date)
request.meta['item'] = searchData
yield request
# searchData['CONTINUE_SEARCHING'] = False
# searchData['page'] += 1
# page = searchData['page']
# url = searchData['section_url']
# request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
# request.meta['item'] = searchData
# yield request
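Taken together, the active code in this hunk is a stop-date loop: page 0 is scraped from the section's HTML, later pages from the LoadMore endpoint, and every article request carries `page`, `section_url`, and a `LAST_LINK` flag in its meta. When the last article of a page is still on or after `stopDate`, `parse_item_with_stop_date` flips `CONTINUE_SEARCHING` and control returns here to POST the next page. An offline sketch of that stopping rule, run against a fake paginated source of article dates (newest first):

from datetime import date

def crawl_until(pages, stop_date):
    # Keep paging while the LAST article of a page is still on or after
    # stop_date; collect only articles that meet the cutoff.
    collected = []
    for page in pages:
        collected.extend(d for d in page if d >= stop_date)
        if not page or page[-1] < stop_date:
            break
    return collected

pages = [[date(2018, 3, 2), date(2018, 3, 1)],
         [date(2018, 1, 2), date(2017, 12, 30)],
         [date(2017, 12, 1)]]
assert crawl_until(pages, date(2018, 1, 1)) == \
    [date(2018, 3, 2), date(2018, 3, 1), date(2018, 1, 2)]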
def parse_item(self, response):
item = NoticiasItem()
@@ -217,8 +268,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item_with_stop_date(self, response):
d = response.xpath('//time/text()').extract_first()
dt = datetime.strptime(d, '%d.%m.%Y').date()
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
if dt >= self.stopDate:
info = response.meta['item']
@@ -251,6 +302,6 @@ class QuotesSpider(scrapy.Spider):
if info['LAST_LINK']:
info['CONTINUE_SEARCHING'] = True
request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
request = scrapy.Request(url=info['section_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = info
yield request
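The last two hunks change the date source and the resume URL: the spider now reads the `article:published_time` meta tag instead of parsing the visible `<time>` text as `%d.%m.%Y`, and when it resumes paging it re-requests `section_url` rather than the article's own `url`. A quick check of the new date parsing on a made-up timestamp:

from datetime import datetime

d = "2018-03-14T09:30:00-06:00"   # hypothetical article:published_time value
dt = datetime.strptime(d[:10], '%Y-%m-%d').date()   # first 10 chars: YYYY-MM-DD
assert dt == datetime(2018, 3, 14).date()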