Commit 2a9d7bdc authored by Renán Sosa Guillen

crawlers

parent d60a1a0a
 # -*- coding: utf-8 -*-
-import scrapy, re, json
+import scrapy, re, json, ast
+from scrapy.selector import Selector
 from datetime import datetime, date
 from elSalvador.items import NoticiasItem
@@ -48,26 +49,26 @@ class QuotesSpider(scrapy.Spider):
         baseURL = "http://www.elsalvador.com/category/noticias/"
         # sectionList = []
-        sectionList = ["nacional"]
+        sectionList = ["internacional"]
-        # if self.stopDate is None:
-        #     for s in sectionList:
-        #         info = ImportantData()
-        #         info['page'] = 1
-        #         request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
-        #         request.meta['item'] = info
-        #         yield request
-        #
-        # else:
-        #     for s in sectionList:
-        #         info = ImportantData()
-        #         info['page'] = 1
-        #         info['CONTINUE_SEARCHING'] = False
-        #         request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
-        #         request.meta['item'] = info
-        #         yield request
-        for s in sectionList:
-            yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
+        if self.stopDate is None:
+            for s in sectionList:
+                info = ImportantData()
+                info['page'] = 1
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
+                request.meta['item'] = info
+                yield request
+
+        else:
+            for s in sectionList:
+                info = ImportantData()
+                info['page'] = 0
+                info['CONTINUE_SEARCHING'] = False
+                request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
+                request.meta['item'] = info
+                yield request
+        # for s in sectionList:
+        #     yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
 
     def parse(self, response):
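Note: the rewritten start_requests above branches on self.stopDate and seeds every request with paging state (page, CONTINUE_SEARCHING) that later callbacks read back from response.meta. A minimal sketch of that request.meta hand-off, with a plain dict standing in for the spider's ImportantData item (the spider name here is illustrative):

    import scrapy

    class MetaHandoffSpider(scrapy.Spider):
        name = "meta_handoff_demo"

        def start_requests(self):
            # seed the section request with its paging state
            info = {'page': 0, 'CONTINUE_SEARCHING': False}
            request = scrapy.Request(url="http://www.elsalvador.com/category/noticias/internacional/",
                                     callback=self.parse_section)
            request.meta['item'] = info
            yield request

        def parse_section(self, response):
            info = response.meta['item']  # the same state object seeded above
            self.logger.info("page %s, continue=%s", info['page'], info['CONTINUE_SEARCHING'])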
@@ -109,82 +110,132 @@ class QuotesSpider(scrapy.Spider):
         #     yield request
 
         linkList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
         linkList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
-        # for link in linkList:
-        #     print link
-        url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
-        url_peticion = "/category/noticias/nacional/"
-        frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "nacional", 'paged': "1", 'category_name': "Nacional", 'url_peticion': url_peticion}
-        yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
-
-    def after_post(self, response):
-        # from scrapy.shell import inspect_response
-        import ast
-        from scrapy.selector import Selector
-        print "This is response: "
-        unescaped = ast.literal_eval(response.body.strip())
-        body = Selector(text=unescaped)
-        # inspect_response(response, self)
-        newsList = []
-        for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
-            link = link.replace('\\', '')
-            if not link in newsList:
-                newsList.append(link)
-        for link in newsList:
-            print link
+        for link in linkList:
+            print link
+        # url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+        # url_peticion = "/category/noticias/internacional/"
+        # frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': "4526", 'category_name': "Internacional", 'url_peticion': url_peticion}
+        #
+        # yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
+
+    # def after_post(self, response):
+    #     searchData = response.meta['item']
+    #     # from scrapy.shell import inspect_response
+    #     # print "This is response: "
+    #     unescaped = ast.literal_eval(response.body.strip())
+    #     body = Selector(text=unescaped)
+    #     # inspect_response(response, self)
+    #     newsList = []
+    #     linksObtained = body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract()
+    #     for link in linksObtained:
+    #         link = link.replace('\\', '')
+    #         if not link in newsList:
+    #             newsList.append(link)
+    #
+    #     # print len(newsList)  # check the length of newsList to decide when to stop paginating
+    #     if len(newsList) > 0:
+    #         for link in newsList:
+    #             info = ImportantData()
+    #             info['url'] = searchData['url']
+    #             info['page'] = searchData['page']
+    #             info['section_url'] = searchData['section_url']
+    #             if link == linkList[-1]: info['LAST_LINK'] = True
+    #             else: info['LAST_LINK'] = False
+    #             reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+    #             reqst.meta['item'] = info
+    #             yield reqst
     def parse_with_stop_date(self, response):
         searchData = response.meta['item']
         CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
 
         if not CONTINUE_SEARCHING:
-            if searchData['page'] == 1:
-                searchData['section_url'] = response.url
-                linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
-                linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
-                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
-                linkList.remove(searchData['section_url'])
-            else:
-                linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
-                linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
-                try:
-                    linkList.remove(searchData['section_url'])
-                except KeyError:
-                    pass
-
-            newsList = []
-            for link in linkList:
-                if not link in newsList:
-                    newsList.append(link)
-
-            for link in newsList:
-                info = ImportantData()
-                info['url'] = response.url
-                info['page'] = searchData['page']
-                info['section_url'] = searchData['section_url']
-                if link == linkList[-1]: info['LAST_LINK'] = True
-                else: info['LAST_LINK'] = False
-                reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
-                reqst.meta['item'] = info
-                yield reqst
+            if searchData['page'] == 0:
+                searchData['section_url'] = response.url
+                newsList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
+                # newsList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
+            else:
+                unescaped = ast.literal_eval(response.body.strip())
+                body = Selector(text=unescaped)
+                newsList = []
+                for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
+                    link = link.replace('\\', '')
+                    if not link in newsList:
+                        newsList.append(link)
+
+            if len(newsList) > 0:
+                for link in newsList:
+                    info = ImportantData()
+                    # info['url'] = response.url
+                    info['page'] = searchData['page']
+                    info['section_url'] = searchData['section_url']
+                    if link == newsList[-1]: info['LAST_LINK'] = True
+                    else: info['LAST_LINK'] = False
+                    reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+                    reqst.meta['item'] = info
+                    yield reqst
+
+            # if searchData['page'] == 1:
+            #     searchData['section_url'] = response.url
+            #     linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
+            #     linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
+            #     linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+            #     linkList.remove(searchData['section_url'])
+            #
+            # else:
+            #     linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
+            #     linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
+            #     try:
+            #         linkList.remove(searchData['section_url'])
+            #     except KeyError:
+            #         pass
+            #
+            # newsList = []
+            # for link in linkList:
+            #     if not link in newsList:
+            #         newsList.append(link)
+            #
+            # for link in newsList:
+            #     info = ImportantData()
+            #     info['url'] = response.url
+            #     info['page'] = searchData['page']
+            #     info['section_url'] = searchData['section_url']
+            #     if link == linkList[-1]: info['LAST_LINK'] = True
+            #     else: info['LAST_LINK'] = False
+            #     reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
+            #     reqst.meta['item'] = info
+            #     yield reqst
 
         else:
             searchData['CONTINUE_SEARCHING'] = False
             searchData['page'] += 1
-            page = searchData['page']
-            url = searchData['section_url']
-            request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
+            page = str(searchData['page'])
+
+            url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
+            url_peticion = "/category/noticias/internacional/"
+            frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': page,
+                       'category_name': "Internacional", 'url_peticion': url_peticion}
+            request = scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.parse_with_stop_date)
             request.meta['item'] = searchData
             yield request
+            # searchData['CONTINUE_SEARCHING'] = False
+            # searchData['page'] += 1
+            # page = searchData['page']
+            # url = searchData['section_url']
+            # request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
+            # request.meta['item'] = searchData
+            # yield request
     def parse_item(self, response):
         item = NoticiasItem()
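Note: the commented-out after_post and the non-zero-page branch of parse_with_stop_date above rely on the same unescaping trick. The LoadMore endpoint returns rendered HTML wrapped as one quoted, backslash-escaped string, so ast.literal_eval peels off the quoting before the text reaches a Selector, and the leftover backslashes are stripped from each extracted link. A self-contained sketch of that round-trip, using an invented, simplified stand-in for response.body.strip():

    import ast
    from scrapy.selector import Selector

    # Invented stand-in for the endpoint's body: escaped HTML wrapped in quotes.
    raw = '"<div class=\\"row news\\"><div class=\\"subsection\\"><h2><a href=\\"http:\\/\\/www.elsalvador.com\\/noticias\\/internacional\\/demo\\/\\">demo</a></h2></div></div>"'

    unescaped = ast.literal_eval(raw)  # removes the outer quotes and the \" escapes
    body = Selector(text=unescaped)
    for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
        print(link.replace('\\', ''))  # -> http://www.elsalvador.com/noticias/internacional/demo/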
@@ -217,8 +268,8 @@ class QuotesSpider(scrapy.Spider):
     def parse_item_with_stop_date(self, response):
-        d = response.xpath('//time/text()').extract_first()
-        dt = datetime.strptime(d, '%d.%m.%Y').date()
+        d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
+        dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
         if dt >= self.stopDate:
             info = response.meta['item']
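Note: the date check above now reads the article:published_time meta tag instead of the visible <time> element. That property carries an ISO-8601 timestamp, so the d[:10] slice keeps only the date portion for the comparison against stopDate. For instance, with an invented timestamp:

    from datetime import datetime

    d = "2018-02-09T14:06:10+00:00"  # shape of an article:published_time value
    dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
    print(dt)  # 2018-02-09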
@@ -251,6 +302,6 @@ class QuotesSpider(scrapy.Spider):
             if info['LAST_LINK']:
                 info['CONTINUE_SEARCHING'] = True
-                request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
+                request = scrapy.Request(url=info['section_url'], callback=self.parse_with_stop_date, dont_filter=True)
                 request.meta['item'] = info
                 yield request
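Note: with this commit the stop-date crawl pages by re-posting the LoadMore form with an incremented paged value instead of fetching ?page= URLs, so the endpoint can also be probed outside Scrapy. A sketch with the requests library, assuming the endpoint still accepts the form fields shown in the diff:

    import requests

    url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
    for paged in (1, 2):  # first two batches of the section
        frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional",
                   'paged': str(paged), 'category_name': "Internacional",
                   'url_peticion': "/category/noticias/internacional/"}
        resp = requests.post(url, data=frmdata)
        # a batch that yields no links marks the end of the section (cf. the len(newsList) check)
        print(paged, resp.status_code, len(resp.text))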