Commit 2a9d7bdc authored by Renán Sosa Guillen

crawlers: point the elSalvador spider at the "internacional" section and page it through the site's LoadMore endpoint

parent d60a1a0a
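At a glance, the diff repoints the spider from the "nacional" section to "internacional" and drops the old `?page=N` archive pagination in favor of POSTing the site's WordPress LoadMore endpoint, which answers with an escaped HTML fragment. A minimal sketch of the request this implies; the helper name is mine, and the field meanings are only inferred from the `frmdata` dicts in the diff (`pppDestacado`/`pppNoDestacado` look like featured and non-featured posts per page):

import scrapy

def load_more_request(slug, category_name, page, callback):
    # Hypothetical helper, not part of the commit: build the POST the
    # diff sends to the WordPress LoadMore endpoint for one result page.
    url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
    frmdata = {
        'pppDestacado': "5",        # inferred: featured posts per page
        'pppNoDestacado': "4",      # inferred: non-featured posts per page
        'slug': slug,
        'paged': str(page),
        'category_name': category_name,
        'url_peticion': "/category/noticias/" + slug + "/",
    }
    return scrapy.http.FormRequest(url=url, formdata=frmdata, callback=callback)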
# -*- coding: utf-8 -*-
import scrapy, re, json
import scrapy, re, json, ast
from scrapy.selector import Selector
from datetime import datetime, date
from elSalvador.items import NoticiasItem
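The new `ast` import exists to unescape that LoadMore response: the endpoint appears to return the rendered HTML as a single quoted, backslash-escaped string, so `ast.literal_eval` recovers the markup before it is handed to a `Selector`. A self-contained sketch on a made-up payload:

import ast
from scrapy.selector import Selector

# Stand-in for response.body from the LoadMore endpoint: HTML wrapped in
# quotes, with escaped quotes and JSON-style escaped slashes.
raw = '"<div class=\\"row news\\"><h2><a href=\\"http:\\/\\/example.com\\/a\\">t</a></h2></div>"'
unescaped = ast.literal_eval(raw.strip())      # peel off the outer quoting
body = Selector(text=unescaped)
links = [l.replace('\\', '')                   # drop the leftover backslashes
         for l in body.xpath('//h2/a/@href').extract()]
assert links == ['http://example.com/a']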
@@ -48,26 +49,26 @@ class QuotesSpider(scrapy.Spider):
baseURL = "http://www.elsalvador.com/category/noticias/"
# sectionList = []
sectionList = ["nacional"]
sectionList = ["internacional"]
# if self.stopDate is None:
# for s in sectionList:
# info = ImportantData()
# info['page'] = 1
# request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
# request.meta['item'] = info
# yield request
#
# else:
# for s in sectionList:
# info = ImportantData()
# info['page'] = 1
# info['CONTINUE_SEARCHING'] = False
# request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
# request.meta['item'] = info
# yield request
if self.stopDate is None:
for s in sectionList:
yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
info = ImportantData()
info['page'] = 1
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse)
request.meta['item'] = info
yield request
else:
for s in sectionList:
info = ImportantData()
info['page'] = 0
info['CONTINUE_SEARCHING'] = False
request = scrapy.Request(url=baseURL + s + "/", callback=self.parse_with_stop_date)
request.meta['item'] = info
yield request
# for s in sectionList:
# yield scrapy.Request(url=baseURL + s + "/", callback=self.parse)
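Note the asymmetry the rewrite introduces: both branches now thread an `ImportantData` item through `request.meta`, but the stop-date branch starts its counter at 0 instead of 1, because page 0 is the plain HTML section page while every later page comes back from the LoadMore endpoint. A stand-alone rendering of the branch, with a plain dict standing in for `ImportantData`:

import scrapy

def entry_requests(base_url, sections, stop_date, parse, parse_with_stop_date):
    # Illustrative helper (the name is mine): one request per section,
    # routed by whether a stop date was supplied.
    for s in sections:
        if stop_date is None:
            info = {'page': 1}
            callback = parse
        else:
            info = {'page': 0, 'CONTINUE_SEARCHING': False}
            callback = parse_with_stop_date
        request = scrapy.Request(url=base_url + s + "/", callback=callback)
        request.meta['item'] = info
        yield request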
def parse(self, response):
@@ -109,32 +110,42 @@ class QuotesSpider(scrapy.Spider):
# yield request
linkList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
linkList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
# for link in linkList:
# print link
url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
url_peticion = "/category/noticias/nacional/"
frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "nacional", 'paged': "1", 'category_name': "Nacional", 'url_peticion': url_peticion}
yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
for link in linkList:
print link
# url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
# url_peticion = "/category/noticias/internacional/"
# frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': "4526", 'category_name': "Internacional", 'url_peticion': url_peticion}
#
# yield scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.after_post)
def after_post(self, response):
# from scrapy.shell import inspect_response
import ast
from scrapy.selector import Selector
print "This is response: "
unescaped = ast.literal_eval(response.body.strip())
body = Selector(text=unescaped)
# inspect_response(response, self)
newsList = []
for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
link = link.replace('\\', '')
if not link in newsList:
newsList.append(link)
for link in newsList:
print link
# def after_post(self, response):
# searchData = response.meta['item']
# # from scrapy.shell import inspect_response
# # print "This is response: "
# unescaped = ast.literal_eval(response.body.strip())
# body = Selector(text=unescaped)
# # inspect_response(response, self)
# newsList = []
# linksObtained = body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract()
# for link in linksObtained:
# link = link.replace('\\', '')
# if not link in newsList:
# newsList.append(link)
#
# # print len(newsList)  # check newsList's length to decide when to stop paging
# if len(newsList) > 0:
# for link in newsList:
# info = ImportantData()
# info['url'] = searchData['url']
# info['page'] = searchData['page']
# info['section_url'] = searchData['section_url']
# if link == linkList[-1]: info['LAST_LINK'] = True
# else: info['LAST_LINK'] = False
# reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
# reqst.meta['item'] = info
# yield reqst
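Both versions of `after_post` repeat the same cleanup: strip the escaped slashes from each href and de-duplicate while preserving document order. Extracted into a helper for clarity (illustrative, not part of the commit):

def unique_links(hrefs):
    # Order-preserving de-duplication plus the backslash cleanup the
    # diff applies to every extracted link.
    seen, out = set(), []
    for href in hrefs:
        href = href.replace('\\', '')
        if href not in seen:
            seen.add(href)
            out.append(href)
    return out

assert unique_links(['http:\\/\\/x.com\\/a', 'http:\\/\\/x.com\\/a']) == ['http://x.com/a']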
def parse_with_stop_date(self, response):
@@ -142,49 +153,89 @@ class QuotesSpider(scrapy.Spider):
CONTINUE_SEARCHING = searchData['CONTINUE_SEARCHING']
if not CONTINUE_SEARCHING:
if searchData['page'] == 1:
if searchData['page'] == 0:
searchData['section_url'] = response.url
linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
linkList.remove(searchData['section_url'])
newsList = response.xpath('//div[@id="main"]').css('h2.large-title').xpath('./a/@href').extract()
# newsList.extend(response.xpath('//div[@class="container even"]').css('h2.large-title').xpath('./a/@href').extract())
else:
linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
try:
linkList.remove(searchData['section_url'])
except ValueError:  # list.remove() raises ValueError, not KeyError
pass
unescaped = ast.literal_eval(response.body.strip())
body = Selector(text=unescaped)
newsList = []
for link in linkList:
for link in body.xpath('//div[@class="row news"]').css('div.subsection').css('h2').xpath('./a/@href').extract():
link = link.replace('\\', '')
if not link in newsList:
newsList.append(link)
if len(newsList) > 0:
for link in newsList:
info = ImportantData()
info['url'] = response.url
# info['url'] = response.url
info['page'] = searchData['page']
info['section_url'] = searchData['section_url']
if link == linkList[-1]: info['LAST_LINK'] = True
if link == newsList[-1]: info['LAST_LINK'] = True
else: info['LAST_LINK'] = False
reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
reqst.meta['item'] = info
yield reqst
# if searchData['page'] == 1:
# searchData['section_url'] = response.url
# linkList = response.xpath('//article[@id="destacada"]/a/@href').extract()
# linkList.extend(response.xpath('//aside[@id="mini_sidebar"]/article/a/@href').extract())
# linkList.extend(response.xpath('//section[@id="principal"]/article/a/@href').extract())
# linkList.extend(response.xpath('//aside[@id="otras_noticias"]/article/a/@href').extract())
# linkList.extend(response.xpath('//div[@class="contenedor"]/article/a/@href').extract())
# linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
# linkList.remove(searchData['section_url'])
#
# else:
# linkList = response.xpath('//div[@class="contenedor"]/article/a/@href').extract()
# linkList.extend(response.xpath('//article[@class="nobordes"]/div/a/@href').extract())
# try:
# linkList.remove(searchData['section_url'])
# except KeyError:
# pass
#
# newsList = []
# for link in linkList:
# if not link in newsList:
# newsList.append(link)
#
# for link in newsList:
# info = ImportantData()
# info['url'] = response.url
# info['page'] = searchData['page']
# info['section_url'] = searchData['section_url']
# if link == linkList[-1]: info['LAST_LINK'] = True
# else: info['LAST_LINK'] = False
# reqst = scrapy.Request(url=link, callback=self.parse_item_with_stop_date)
# reqst.meta['item'] = info
# yield reqst
else:
searchData['CONTINUE_SEARCHING'] = False
searchData['page'] += 1
page = searchData['page']
url = searchData['section_url']
request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
page = str(searchData['page'])
url = "http://www.elsalvador.com//wp-json/LoadMore/Category/posts"
url_peticion = "/category/noticias/internacional/"
frmdata = {'pppDestacado': "5", 'pppNoDestacado': "4", 'slug': "internacional", 'paged': page,
'category_name': "Internacional", 'url_peticion': url_peticion}
request = scrapy.http.FormRequest(url=url, formdata=frmdata, callback=self.parse_with_stop_date)
request.meta['item'] = searchData
yield request
# searchData['CONTINUE_SEARCHING'] = False
# searchData['page'] += 1
# page = searchData['page']
# url = searchData['section_url']
# request = scrapy.Request(url=url + "?page=" + str(page), callback=self.parse_with_stop_date)
# request.meta['item'] = searchData
# yield request
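Taken together, the active code in this hunk is a stop-date loop: page 0 is scraped from the section's HTML, later pages from the LoadMore endpoint, and every article request carries `page`, `section_url`, and a `LAST_LINK` flag in its meta. When the last article of a page is still on or after `stopDate`, `parse_item_with_stop_date` flips `CONTINUE_SEARCHING` and control returns here to POST the next page. An offline sketch of that stopping rule, run against a fake paginated source of article dates (newest first):

from datetime import date

def crawl_until(pages, stop_date):
    # Keep paging while the LAST article of a page is still on or after
    # stop_date; collect only articles that meet the cutoff.
    collected = []
    for page in pages:
        collected.extend(d for d in page if d >= stop_date)
        if not page or page[-1] < stop_date:
            break
    return collected

pages = [[date(2018, 3, 2), date(2018, 3, 1)],
         [date(2018, 1, 2), date(2017, 12, 30)],
         [date(2017, 12, 1)]]
assert crawl_until(pages, date(2018, 1, 1)) == \
    [date(2018, 3, 2), date(2018, 3, 1), date(2018, 1, 2)]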
def parse_item(self, response):
item = NoticiasItem()
@@ -217,8 +268,8 @@ class QuotesSpider(scrapy.Spider):
def parse_item_with_stop_date(self, response):
d = response.xpath('//time/text()').extract_first()
dt = datetime.strptime(d, '%d.%m.%Y').date()
d = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
dt = datetime.strptime(d[:10], '%Y-%m-%d').date()
if dt >= self.stopDate:
info = response.meta['item']
@@ -251,6 +302,6 @@ class QuotesSpider(scrapy.Spider):
if info['LAST_LINK']:
info['CONTINUE_SEARCHING'] = True
request = scrapy.Request(url=info['url'], callback=self.parse_with_stop_date, dont_filter=True)
request = scrapy.Request(url=info['section_url'], callback=self.parse_with_stop_date, dont_filter=True)
request.meta['item'] = info
yield request
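The last two hunks change the date source and the resume URL: the spider now reads the `article:published_time` meta tag instead of parsing the visible `<time>` text as `%d.%m.%Y`, and when it resumes paging it re-requests `section_url` rather than the article's own `url`. A quick check of the new date parsing on a made-up timestamp:

from datetime import datetime

d = "2018-03-14T09:30:00-06:00"   # hypothetical article:published_time value
dt = datetime.strptime(d[:10], '%Y-%m-%d').date()   # first 10 chars: YYYY-MM-DD
assert dt == datetime(2018, 3, 14).date()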