Commit 9166c6e7 authored by Mario Chirinos

el valle

parent 42ffe60e
"""
Spider for elvalle.com.mx
Author: Mario Chirinos Coluga
Usage: scrapy crawl noticias --nolog -O 2017-04-23.json -a year=2017 -a month=4 -a day=23
"""
import scrapy
import re
from elValle.items import ElvalleItem
#-------------------------------------------------------------------------------
# Matches one markup tag: a '<', one or more characters that are not '>',
# and the closing '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` removed.

    Only complete ``<...>`` sequences are stripped; a lone ``<`` or ``>``
    that does not form such a sequence is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched tags.
    """
    return TAG_RE.sub('', text)
#-------------------------------------------------------------------------------
class NoticiasSpider(scrapy.Spider):
    """Spider that collects the articles listed under one date on elvalle.com.mx.

    The date is supplied through the spider arguments ``year``, ``month`` and
    ``day`` (``-a year=2017 -a month=4 -a day=23``). The spider requests
    ``https://elvalle.com.mx/<year>/<month>/<day>/``, follows every article
    link found there (and on the linked pagination pages) and yields one
    ``ElvalleItem`` per article.
    """

    name = 'noticias'
    allowed_domains = ['elvalle.com.mx']
    # Not used for crawling: start_requests() below builds the first URL.
    start_urls = ['https://elvalle.com.mx/']

    #-----------------------------------------------------------------------
    def start_requests(self):
        """Build the date-based listing URL and request it.

        Sets ``self.date`` (``YYYY-MM-DD``) and ``self.baseURL`` as a side
        effect.

        Raises:
            ValueError: If any of the ``year``, ``month`` or ``day`` spider
                arguments is missing.
        """
        parts = {}
        for key in ("year", "month", "day"):
            value = getattr(self, key, None)
            if value is None:
                # Fail with an explicit message instead of an opaque
                # TypeError/AttributeError during string concatenation.
                raise ValueError(
                    "missing spider argument '{0}'; run with "
                    "-a year=YYYY -a month=M -a day=D".format(key)
                )
            # str() lets the spider also accept ints when it is started
            # programmatically rather than from the command line.
            parts[key] = str(value)

        year = parts["year"]
        month = parts["month"].zfill(2)
        day = parts["day"].zfill(2)

        self.date = "-".join([year, month, day])
        self.baseURL = "https://elvalle.com.mx/" + "/".join([year, month, day]) + "/"
        yield scrapy.Request(url=self.baseURL, callback=self.parseList)

    #-----------------------------------------------------------------------
    def parseList(self, response):
        """Yield a request per article link, then a request for the next page.

        Args:
            response: The response for a listing page.

        Yields:
            ``scrapy.Request`` objects handled by ``parseItem`` for each
            article link, followed by one handled by ``parseList`` when a
            pagination link is found.
        """
        print(response.url)
        for href in response.xpath('//article//div[@class="post-details"]/a/@href').extract():
            # urljoin leaves absolute URLs unchanged and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(url=response.urljoin(href), callback=self.parseItem)

        # The href of the link inside the last <li> of the pagination list.
        next_page = response.xpath('//ul[@class="pagination clearfix"]/li[last()]/a/@href').extract_first()
        print("nextPage", next_page)
        if next_page is not None:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parseList)

    #-----------------------------------------------------------------------
    def parseItem(self, response):
        """Extract one article page into an ``ElvalleItem``.

        Args:
            response: The response for an article page.

        Yields:
            A single ``ElvalleItem`` with the ``title``, ``date``, ``topic``,
            ``author``, ``text`` and ``url`` fields filled in.
        """
        item = ElvalleItem()

        # One line per text node found directly under an <article>/<p>,
        # each terminated by a newline.
        paragraphs = response.xpath('//article/p/text()').extract()
        text = ''.join(remove_tags(p) + "\n" for p in paragraphs)

        item['title'] = response.xpath('//h1[@class="entry-title"]/text()').extract_first()
        item['date'] = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
        item['topic'] = response.xpath('//a[@rel="category tag"]/text()').extract()
        item['author'] = response.xpath('//meta[@name="author"]/@content').extract_first()
        item['text'] = text
        item['url'] = response.xpath('//link[@rel="canonical"]/@href').extract_first()
        print(item['title'])
        yield item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment