Commit 2a9d7bdc authored by Renán Sosa Guillen

crawlers

parent d60a1a0a
 # -*- coding: utf-8 -*-
-import scrapy, re, json
+import scrapy, re, json, ast
+from scrapy.selector import Selector
 from datetime import datetime, date
 from elSalvador.items import NoticiasItem
@@ -48,26 +49,26 @@ class QuotesSpider(scrapy.Spider):
         baseURL = "http://www.elsalvador.com/category/noticias/"
         # sectionList = []
-        sectionList = ["nacional"]
+        sectionList = ["internacional"]
-        # if self.stopDate is None:
-        #     for s in sectionList:
-        #         info = ImportantData()
-        #         info['page'] = 1
-        #         request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
-        #         request.meta['item'] = info
-        #         yield request
-        #
-        # else:
-        #     for s in sectionList:
-        #         info = ImportantData()
-        #         info['page'] = 1
-        #         info['CONTINUE_SEARCHING'] = False
-        #         request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
-        #         request.meta['item'] = info
-        #         yield request
-        for s in sectionList:
-            yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
+        if self.stopDate is None:
+            for s in sectionList:
+                info = ImportantData()
+                info['page'] = 1
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
+                request.meta['item'] = info
+                yield request
+
+        else:
+            for s in sectionList:
+                info = ImportantData()
+                info['page'] = 0
+                info['CONTINUE_SEARCHING'] = False
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
+                request.meta['item'] = info
+                yield request
+        # for s in sectionList:
+        #     yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
 
     def parse(self, response):
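Note: the rewritten start_requests above branches on self.stopDate and seeds every request with paging state (page, CONTINUE_SEARCHING) that later callbacks read back from response.meta. A minimal sketch of that request.meta hand-off, with a plain dict standing in for the spider's ImportantData item (the spider name here is illustrative):

    import scrapy

    class MetaHandoffSpider(scrapy.Spider):
        name = "meta_handoff_demo"

        def start_requests(self):
            # seed the section request with its paging state
            info = {'page': 0, 'CONTINUE_SEARCHING': False}
            request = scrapy.Request(url="http://www.elsalvador.com/category/noticias/internacional/",
                                     callback=self.parse_section)
            request.meta['item'] = info
            yield request

        def parse_section(self, response):
            info = response.meta['item']  # the same state object seeded above
            self.logger.info("page %s, continue=%s", info['page'], info['CONTINUE_SEARCHING'])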
@@ -109,82 +110,132 @@ class QuotesSpider(scrapy.Spider):
         #     yield request
 
         linkList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
         linkList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
-        # for link in linkList:
-        #     print link
-        url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
-        url_peticion = "/category/noticias/nacional/"
-        frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "nacional", 'paged': "1", 'category_name': "Nacional", 'url_peticion': url_peticion}
-        yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
-
-    def after_post(self, response):
-        # from scrapy.shell import inspect_response
-        import ast
-        from scrapy.selector import Selector
-        print "This is response: "
-        unescaped = ast.literal_eval(response.body.strip())
-        body = Selector(text=unescaped)
-        # inspect_response(response, self)
-        newsList = []
-        for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
-            link = link.replace('\\', '')
-            if not link in newsList:
-                newsList.append(link)
-        for link in newsList:
-            print link
+        for link in linkList:
+            print link
+        # url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+        # url_peticion = "/category/noticias/internacional/"
+        # frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': "4526", 'category_name': "Internacional", 'url_peticion': url_peticion}
+        #
+        # yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
+
+    # def after_post(self, response):
+    #     searchData = response.meta['item']
+    #     # from scrapy.shell import inspect_response
+    #     # print "This is response: "
+    #     unescaped = ast.literal_eval(response.body.strip())
+    #     body = Selector(text=unescaped)
+    #     # inspect_response(response, self)
+    #     newsList = []
+    #     linksObtained = body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract()
+    #     for link in linksObtained:
+    #         link = link.replace('\\', '')
+    #         if not link in newsList:
+    #             newsList.append(link)
+    #
+    #     # print len(newsList)  # check the length of newsList to decide when to stop paginating
+    #     if len(newsList) > 0:
+    #         for link in newsList:
+    #             info = ImportantData()
+    #             info['url'] = searchData['url']
+    #             info['page'] = searchData['page']
+    #             info['section_url'] = searchData['section_url']
+    #             if link == linkList[-1]: info['LAST_LINK'] = True
+    #             else: info['LAST_LINK'] = False
+    #             reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+    #             reqst.meta['item'] = info
+    #             yield reqst
     def parse_with_stop_date(self, response):
         searchData = response.meta['item']
         CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
 
         if not CONTINUE_SEARCHING:
-            if searchData['page'] == 1:
-                searchData['section_url'] = response.url
-                linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
-                linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
-                linkList.remove(searchData['section_url'])
-            else:
-                linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
-                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
-                try:
-                    linkList.remove(searchData['section_url'])
-                except KeyError:
-                    pass
-
-            newsList = []
-            for link in linkList:
-                if not link in newsList:
-                    newsList.append(link)
-
-            for link in newsList:
-                info = ImportantData()
-                info['url'] = response.url
-                info['page'] = searchData['page']
-                info['section_url'] = searchData['section_url']
-                if link == linkList[-1]: info['LAST_LINK'] = True
-                else: info['LAST_LINK'] = False
-                reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
-                reqst.meta['item'] = info
-                yield reqst
+            if searchData['page'] == 0:
+                searchData['section_url'] = response.url
+                newsList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
+                # newsList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
+            else:
+                unescaped = ast.literal_eval(response.body.strip())
+                body = Selector(text=unescaped)
+                newsList = []
+                for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
+                    link = link.replace('\\', '')
+                    if not link in newsList:
+                        newsList.append(link)
+
+            if len(newsList) > 0:
+                for link in newsList:
+                    info = ImportantData()
+                    # info['url'] = response.url
+                    info['page'] = searchData['page']
+                    info['section_url'] = searchData['section_url']
+                    if link == newsList[-1]: info['LAST_LINK'] = True
+                    else: info['LAST_LINK'] = False
+                    reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+                    reqst.meta['item'] = info
+                    yield reqst
+
+            # if searchData['page'] == 1:
+            #     searchData['section_url'] = response.url
+            #     linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
+            #     linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+            #     linkList.remove(searchData['section_url'])
+            #
+            # else:
+            #     linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
+            #     linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+            #     try:
+            #         linkList.remove(searchData['section_url'])
+            #     except KeyError:
+            #         pass
+            #
+            # newsList = []
+            # for link in linkList:
+            #     if not link in newsList:
+            #         newsList.append(link)
+            #
+            # for link in newsList:
+            #     info = ImportantData()
+            #     info['url'] = response.url
+            #     info['page'] = searchData['page']
+            #     info['section_url'] = searchData['section_url']
+            #     if link == linkList[-1]: info['LAST_LINK'] = True
+            #     else: info['LAST_LINK'] = False
+            #     reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+            #     reqst.meta['item'] = info
+            #     yield reqst
 
         else:
             searchData['CONTINUE_SEARCHING'] = False
             searchData['page'] += 1
-            page = searchData['page']
-            url = searchData['section_url']
-            request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
+            page = str(searchData['page'])
+
+            url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+            url_peticion = "/category/noticias/internacional/"
+            frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': page,
+                       'category_name': "Internacional", 'url_peticion': url_peticion}
+            request = scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.parse_with_stop_date)
             request.meta['item'] = searchData
             yield request
+            # searchData['CONTINUE_SEARCHING'] = False
+            # searchData['page'] += 1
+            # page = searchData['page']
+            # url = searchData['section_url']
+            # request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
+            # request.meta['item'] = searchData
+            # yield request
     def parse_item(self, response):
         item = NoticiasItem()
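Note: the commented-out after_post and the non-zero-page branch of parse_with_stop_date above rely on the same unescaping trick. The LoadMore endpoint returns rendered HTML wrapped as one quoted, backslash-escaped string, so ast.literal_eval peels off the quoting before the text reaches a Selector, and the leftover backslashes are stripped from each extracted link. A self-contained sketch of that round-trip, using an invented, simplified stand-in for response.body.strip():

    import ast
    from scrapy.selector import Selector

    # Invented stand-in for the endpoint's body: escaped HTML wrapped in quotes.
    raw = '"<div class=\\"row news\\"><div class=\\"subsection\\"><h2><a href=\\"http:\\/\\/www.elsalvador.com\\/noticias\\/internacional\\/demo\\/\\">demo</a></h2></div></div>"'

    unescaped = ast.literal_eval(raw)  # removes the outer quotes and the \" escapes
    body = Selector(text=unescaped)
    for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
        print(link.replace('\\', ''))  # -> http://www.elsalvador.com/noticias/internacional/demo/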
@@ -217,8 +268,8 @@ class QuotesSpider(scrapy.Spider):
     def parse_item_with_stop_date(self, response):
-        d = response.xpath('//time/text()').extract_first()
-        dt = datetime.strptime(d, '%d.%m.%Y').date()
+        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
         if dt >= self.stopDate:
             info = response.meta['item']
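Note: the date check above now reads the article:published_time meta tag instead of the visible <time> element. That property carries an ISO-8601 timestamp, so the d[:10] slice keeps only the date portion for the comparison against stopDate. For instance, with an invented timestamp:

    from datetime import datetime

    d = "2018-02-09T14:06:10+00:00"  # shape of an article:published_time value
    dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
    print(dt)  # 2018-02-09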
@@ -251,6 +302,6 @@ class QuotesSpider(scrapy.Spider):
             if info['LAST_LINK']:
                 info['CONTINUE_SEARCHING'] = True
-                request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
+                request = scrapy.Request(url=info['section_url'], callback=self.parse_with_stop_date, dont_filter=True)
                 request.meta['item'] = info
                 yield request
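Note: with this commit the stop-date crawl pages by re-posting the LoadMore form with an incremented paged value instead of fetching ?page= URLs, so the endpoint can also be probed outside Scrapy. A sketch with the requests library, assuming the endpoint still accepts the form fields shown in the diff:

    import requests

    url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
    for paged in (1, 2):  # first two batches of the section
        frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional",
                   'paged': str(paged), 'category_name': "Internacional",
                   'url_peticion': "/category/noticias/internacional/"}
        resp = requests.post(url, data=frmdata)
        # a batch that yields no links marks the end of the section (cf. the len(newsList) check)
        print(paged, resp.status_code, len(resp.text))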