Commit 57f362cd authored by Mario Chirinos Colunga's avatar Mario Chirinos Colunga 💬

el sur

parent eac90030
...@@ -9,12 +9,12 @@ import scrapy ...@@ -9,12 +9,12 @@ import scrapy
class NoticiasItem(scrapy.Item): class NoticiasItem(scrapy.Item):
# define the fields for your item here like: # define the fields for your item here like:
# name = scrapy.Field() # name = scrapy.Field()
title = scrapy.Field() title = scrapy.Field()
text = scrapy.Field() text = scrapy.Field()
date = scrapy.Field() date = scrapy.Field()
location = scrapy.Field() location = scrapy.Field()
author = scrapy.Field() author = scrapy.Field()
topic = scrapy.Field() topic = scrapy.Field()
url = scrapy.Field() url = scrapy.Field()
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
# https://doc.scrapy.org/en/latest/topics/settings.html # https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
FEED_EXPORT_ENCODING = 'utf-8'
BOT_NAME = 'elSur' BOT_NAME = 'elSur'
SPIDER_MODULES = ['elSur.spiders'] SPIDER_MODULES = ['elSur.spiders']
......
"""
MEDIA:
El Sur, Guerrero
USAGE:
## Get all the news from a specific date. ##
---------------------------------------------------------------------------------------------
$ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
"""
import scrapy, re
from elSur.items import NoticiasItem
#TAG_RE = re.compile(r'<[^>]+>')
#HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
#HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
def remove_tags(text):
    """Return *text* with every HTML/XML tag (any ``<...>`` span) removed.

    Fix: the original body referenced ``TAG_RE``, whose definition is
    commented out above, so every call raised ``NameError``.  The pattern
    is now applied inline via ``re.sub`` (``re`` caches compiled patterns,
    so repeated calls stay cheap).
    """
    return re.sub(r'<[^>]+>', '', text)
class QuotesSpider(scrapy.Spider):
    """
    Basic Scrapy Spider class.

    Crawls the El Sur (Guerrero) date-archive page built from the
    ``year``/``month``/``day`` command-line arguments and yields one
    NoticiasItem per article link found on the listing.

    NOTE(review): parse_item currently yields an *empty* item — the field
    extraction was commented out in this revision and has not been
    reinstated here; confirm against the intended site markup.
    """

    name = "noticias"

    def start_requests(self):
        """Build the /YYYY/MM/DD/ archive URL and schedule the first request.

        Relies on ``year``, ``month`` and ``day`` being passed via
        ``scrapy crawl ... -a year=... -a month=... -a day=...``.
        """
        year = getattr(self, "year", None)
        month = getattr(self, "month", None)
        day = getattr(self, "day", None)
        # zero-pad month/day so the URL matches the site's /YYYY/MM/DD/ scheme
        self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
        print(self.baseURL)
        yield scrapy.Request(url=self.baseURL, callback=self.parse)

    def parse(self, response):
        """Extract article links from the archive listing page.

        Fix: the original line was missing the closing parenthesis of
        ``set(...)``, which made the whole module a SyntaxError.  ``set``
        de-duplicates links that appear more than once in the listing.
        """
        print(response)
        for link in set(response.css('#post-list').css("li > div > div > a::attr(href)").extract()):
            print(link)
            yield scrapy.Request(url=link, callback=self.parse_item)

    def parse_item(self, response):
        """Yield a NoticiasItem for one article page.

        Field extraction (date/title/topic/text/url) was commented out in
        this revision; the item is yielded empty pending reinstatement.
        """
        print(response)
        item = NoticiasItem()
        yield item
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
MEDIA: MEDIA:
El Sur, Guerrero El Sur, Guerrero
USAGE: USAGE:
## Get all the news from a specific date. ## ## Get all the news from a specific date. ##
--------------------------------------------------------------------------------------------- ---------------------------------------------------------------------------------------------
$ cd elSur/ $ cd elSur/
$ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5 $ scrapy crawl noticias --nolog -s filename=2018-09-05.json -a year=2018 -a month=9 -a day=5
""" """
import scrapy, re import scrapy, re
from elSur.items import NoticiasItem from elSur.items import NoticiasItem
#from dateutil.parser import parse
from dateparser import parse
TAG_RE = re.compile(r'<[^>]+>') from datetime import datetime
def remove_tags(text):
return TAG_RE.sub('', text)
HEAD_RE_1 = re.compile(r'Texto:.*?[/y] Foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
HEAD_RE_2 = re.compile(r'Texto y foto:.*?\n(.*?[0-9]{1,2} de [a-zA-Z]*? de [0-9]{4}\.)?')
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
""" """
Basic Scrapy Spider class Basic Scrapy Spider class
""" """
name = "noticias" name = "noticias"
def start_requests(self): def start_requests(self):
year = getattr(self, "year", None) year = getattr(self, "year", None)
month = getattr(self, "month", None) month = getattr(self, "month", None)
day = getattr(self, "day", None) day = getattr(self, "day", None)
baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2)) self.baseURL = "https://suracapulco.mx/{0}/{1}/{2}/".format(year, month.zfill(2), day.zfill(2))
print(self.baseURL)
yield scrapy.Request(url=baseURL, callback=self.parse) yield scrapy.Request(url=self.baseURL, callback=self.parse)
def parse(self, response):
print(response)
def parse(self, response):
for link in response.css('div.dslc-blog-posts').css('div.dslc-blog-post-title > h2 > a::attr(href)').extract(): for link in response.css('#post-list').css("li > div > div > a::attr(href)").extract():
yield scrapy.Request(url=link, callback=self.parse_item) print(link)
yield scrapy.Request(url=link, callback=self.parse_item)
pag_lst = response.css('div.dslc-pagination > ul > li')
if len(pag_lst) > 0: if len(response.css(".paging-navigation > a::attr(href)").extract())>0:
del pag_lst[0] next_page = response.css(".paging-navigation > a::attr(href)").extract()[0]
del pag_lst[0] print(next_page)
next_page = None if next_page is not None: yield scrapy.Request(url=next_page, callback=self.parse)
for li_obj in pag_lst: def parse_item(self, response):
li = remove_tags(li_obj.extract()) print(response.encoding)
if not li.isdigit(): date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first()
next_page = li_obj.xpath('./a/@href').extract_first() title = response.xpath('//meta[@property="og:title"]/@content').extract_first().lower()
break topic = response.xpath('//meta[@property="article:section"]/@content').extract_first().lower()
text = ""
if next_page is not None : yield scrapy.Request(url=next_page, callback=self.parse) for p in response.css("div.xt-post-content > p::text").extract():
text+= p.replace("\n", "")+"\n"
item = NoticiasItem()
def parse_item(self, response): item['date'] = datetime.fromtimestamp(int(date)).isoformat()
item = NoticiasItem() item['title'] = title
text = '' item['topic'] = topic
item['text'] = text
news_date = response.xpath('//meta[@property="article:published_time"]/@content').extract_first() item['url'] = response.url
if item["text"] !="":
title = response.css('div.dslc-tp-title > h1').extract_first() print(item)
if title is not None : title = remove_tags(title) yield item
else:
topic = response.css('div.dslc-tp-meta').xpath('./ul/li[3]/a[1]').extract_first() yield
if topic is not None : topic = remove_tags(topic)
for p in response.xpath('//div[@id="dslc-theme-content-inner"]').css('p').extract():
p = remove_tags(p)
text += p + "\n"
dateline = response.css('span.dateline').extract_first()
if dateline is not None:
dateline = remove_tags(dateline)
text = text.replace(dateline, '')
text = text.replace(u'\u00a0', ' ')
text = HEAD_RE_1.sub('', text)
text = HEAD_RE_2.sub('', text)
## News item info ##
item['date'] = news_date
item['title'] = title
item['topic'] = topic
item['text'] = text.strip()
item['url'] = response.url
yield item
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment