Commit 61744f46 authored by Renán Sosa Guillen

Crawlers.

parent d66f91ca
@@ -49,6 +49,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
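The line being commented out here is Python 2-only syntax (`print item['title']` is a print statement, a SyntaxError under Python 3). A minimal sketch of the usual alternative, assuming a standard Scrapy spider; the selector and item shape are illustrative, not from this commit:

```python
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"  # hypothetical spider name

    def parse_page(self, response):
        item = {}
        item['title'] = response.css('h1::text').extract_first()
        # Works on Python 2 and 3 and respects Scrapy's LOG_LEVEL,
        # unlike the bare print statement being removed in this commit.
        self.logger.debug('scraped title: %s', item['title'])
        yield item
```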
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+'page/'+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
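The recurring change in these pagination hunks: URLs for page 2 onward are now built from a fixed `self.baseURL` instead of `response.url`, so follow-up requests can no longer compound into paths like `example.com/page/2page/3`. A sketch of the corrected pattern, assuming `baseURL` is set once in `start_requests` (the attribute name appears in the diff; the scaffolding around it is assumed):

```python
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "noticias"  # hypothetical

    def start_requests(self):
        self.baseURL = 'http://example.com'  # assumed site root
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
        if len(pagination) > 0:
            pagination = pagination[-1].strip('/')
            pages = int(pagination[pagination.rfind('/')+1:])
            for page in range(0, pages):
                if page == 0:
                    # page 1 is the page we are already on
                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                else:
                    # built from the fixed base, not from response.url
                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
        else:
            yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)

    def parse_page(self, response):
        yield {'url': response.url}  # placeholder article parser
```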
@@ -37,9 +37,9 @@ class QuotesSpider(scrapy.Spider):
             pages = int(pagination[pagination.rfind('/')+1:])
             for page in range(0,pages):
                 if ( page == 0 ):
-                    yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page, dont_filter=True)
+                    yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -57,6 +57,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -31,15 +31,15 @@ class QuotesSpider(scrapy.Spider):
             yield scrapy.Request(url=url, callback=self.parse)

     def parse(self, response):
-        pagination = response.xpath('//div[@id="content"]/div[3]/a/@href').extract()
+        pagination = response.xpath('//*[@class="pagination"]/a[@class="page-numbers"]/@href').extract()
         if ( len(pagination) > 0 ):
-            pagination = pagination[-2].strip('/')
+            pagination = pagination[-1].strip('/')
             pages = int(pagination[pagination.rfind('/')+1:])
             for page in range(0, pages):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
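The page count in these spiders comes from the href of the last page-number link; the hunk above also switches from the second-to-last link (`[-2]`) to the last (`[-1]`) to match the new selector. A worked sketch of that string arithmetic, with an assumed example href:

```python
# assumed example; real hrefs depend on the site's permalink structure
href = 'http://example.com/page/15/'
href = href.strip('/')                 # -> 'http://example.com/page/15'
pages = int(href[href.rfind('/')+1:])  # substring after the last '/' -> 15
assert pages == 15
```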
@@ -60,6 +60,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -40,7 +40,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+'/page/'+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+'/page/'+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -58,6 +58,6 @@ class QuotesSpider(scrapy.Spider):
         item['text'] = text
         item['topic'] = response.css('div.post-tags').css('a::text').extract()
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
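This hunk also shows the fields these spiders populate (`title`, `text`, `topic`, `url`). The item class itself is not part of the diff; a plausible definition, inferred from those fields, would look like:

```python
import scrapy

# Assumed shape, reconstructed from the fields used in the hunks;
# the project's actual item class and module name are not shown here.
class NoticiasItem(scrapy.Item):
    title = scrapy.Field()
    text = scrapy.Field()
    topic = scrapy.Field()  # tag strings from div.post-tags
    url = scrapy.Field()
```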
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -61,5 +61,5 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
\ No newline at end of file
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -58,6 +58,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -57,6 +57,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -57,6 +57,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -57,6 +57,6 @@ class QuotesSpider(scrapy.Spider):
         for paragraph in response.css('div.td-post-content').css('p').extract():
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
-        print item['title']
+        # print item['title']
         yield item
@@ -48,5 +48,5 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph)
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
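All of the `text` assembly in these hunks relies on `remove_tags`, which these spiders presumably import from `w3lib.html` (w3lib ships with Scrapy; the import itself is outside the diffed hunks). The pattern in isolation:

```python
from w3lib.html import remove_tags

def build_text(response):
    """Strip tags from each extracted <p> fragment and join with newlines."""
    text = ''
    # 'div.td-post-content' is the article-body selector used in one of the
    # spiders above; other spiders in this commit use different containers.
    for paragraph in response.css('div.td-post-content').css('p').extract():
        text += remove_tags(paragraph) + '\n'
    return text
```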
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -61,6 +61,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -57,6 +57,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
@@ -67,5 +67,5 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags( paragraph ) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item
\ No newline at end of file
@@ -39,7 +39,7 @@ class QuotesSpider(scrapy.Spider):
                 if ( page == 0 ):
                     yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
                 else:
-                    yield scrapy.Request(url=response.url+"/page/"+str(page+1), callback=self.parse_page)
+                    yield scrapy.Request(url=self.baseURL+"/page/"+str(page+1), callback=self.parse_page)
         else:
             yield scrapy.Request(url=response.url, callback=self.parse_page, dont_filter=True)
@@ -57,6 +57,6 @@ class QuotesSpider(scrapy.Spider):
             text += remove_tags(paragraph) + '\n'
         item['text'] = text
         item['url'] = response.url
-        print item['title']
+        # print item['title']
         yield item