Commit ea874dde authored by Renán Sosa Guillen

crawlers

parent 87ed4374
@@ -8,7 +8,13 @@
 import scrapy


-class DiarioyucatanItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
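Note: NoticiasItem is defined in each project's items.py and imported by its spider. For reference, a minimal sketch of how a spider callback could populate and yield it; the start URL and CSS selectors below are illustrative assumptions, not the extraction logic these spiders actually use.

    # Hypothetical spider showing how NoticiasItem is filled (URL and selectors are assumptions)
    import scrapy
    from diarioYucatan.items import NoticiasItem

    class ExampleSpider(scrapy.Spider):
        name = "noticias_example"
        start_urls = ["http://www.yucatan.com.mx/seccion/yucatan"]

        def parse(self, response):
            item = NoticiasItem()
            item['title'] = response.css('h1::text').extract_first()
            item['text'] = ' '.join(response.css('p::text').extract())
            item['date'] = response.css('time::attr(datetime)').extract_first()
            item['url'] = response.url
            yield item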
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class DiarioyucatanPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
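Note: the try/except blocks above simply skip fields that a given article never populated. A more compact equivalent (a sketch, not part of this commit) would iterate over the field names, since scrapy.Item supports membership tests on populated fields:

    from collections import OrderedDict

    FIELDS = ("date", "topic", "title", "author", "location", "text", "url")

    def item_to_row(item):
        # keep only the fields that were actually set, in the order listed above
        return OrderedDict((key, item[key]) for key in FIELDS if key in item)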
@@ -27,7 +27,7 @@ NEWSPIDER_MODULE = 'diarioYucatan.spiders'
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16
@@ -64,9 +64,9 @@ COOKIES_ENABLED = False
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'diarioYucatan.pipelines.DiarioyucatanPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'diarioYucatan.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
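Note: with ITEM_PIPELINES enabled, JsonWriterPipeline runs on every crawl and reads its output path from the filename setting (scrapy crawl noticias --nolog -s filename=noticias.json). The same crawl can also be started programmatically; a sketch, assuming it is run from inside the Scrapy project so get_project_settings() can locate settings.py:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('filename', 'noticias.json')  # what JsonWriterPipeline.from_crawler() reads

    process = CrawlerProcess(settings)
    process.crawl('noticias')  # spider looked up by name from the project
    process.start()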
+# -*- coding: utf-8 -*-
 import scrapy, re
+from diarioYucatan.items import NoticiasItem
 # from datetime import datetime, date, timedelta
 # from scrapy.spidermiddlewares.httperror import HttpError

 """
+MEDIO:
+Diario de Yucatán, Yuc.
+
 Esta version descarga todas las noticias contenidas en la pagina, sin necesidad
 de una fecha especifica.

 USO:
-scrapy crawl noticias -t json --nolog -o noticias.json
+scrapy crawl noticias --nolog -s filename=noticias.json
 """

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
@@ -46,7 +37,7 @@ class QuotesSpider(scrapy.Spider):
         # 'multimedia', 'multimedia/fotos', 'multimedia/videos']
         self.globalLinkSet = set()

-        self.baseURL = 'http://www.yucatan.com.mx/seccion/'
+        self.baseURL = "http://www.yucatan.com.mx/seccion/"
         self.parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6, 'julio': 7,
                               'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}
...
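Note: parsing_month maps Spanish month names to month numbers. A sketch of how such a map is typically applied to a scraped date string; the exact text format used on the site is an assumption:

    import re
    from datetime import date

    parsing_month = {'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
                     'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

    def parse_spanish_date(text):
        # e.g. "20 de diciembre de 2017" -> date(2017, 12, 20)
        day, month, year = re.match(r'(\d{1,2}) de (\w+) de (\d{4})', text).groups()
        return date(int(year), parsing_month[month.lower()], int(day))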
@@ -8,7 +8,13 @@
 import scrapy


-class ElfinancieroItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class ElfinancieroPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'elFinanciero.spiders'
 #USER_AGENT = 'elFinanciero (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'elFinanciero.pipelines.ElfinancieroPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'elFinanciero.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
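Note on the ROBOTSTXT_OBEY change above: commenting it out drops the project back to Scrapy's library default, which is False, so robots.txt is no longer consulted. A quick check:

    # Scrapy's built-in default (the project template is what set it to True)
    from scrapy.settings.default_settings import ROBOTSTXT_OBEY
    print(ROBOTSTXT_OBEY)  # False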
+# -*- coding: utf-8 -*-
 import scrapy, re
+from elFinanciero.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json
-'''
+"""
+MEDIO:
+El Financiero, CDMX
+
+USO:
+scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
@@ -23,16 +27,6 @@ class UTC(tzinfo):
         return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
@@ -42,7 +36,7 @@ class QuotesSpider(scrapy.Spider):
                               'mayo': 5, 'junio': 6, 'julio': 7, 'agosto': 8,
                               'septiembre': 9, 'octubre': 10, 'noviembre': 11, 'diciembre': 12}

-        self.baseURL = 'http://www.elfinanciero.com.mx/rss'
+        self.baseURL = "http://www.elfinanciero.com.mx/rss"

         yield scrapy.Request(url=self.baseURL, callback=self.parse)
...
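Note: the spider's UTC class (only its tzname, 'UTC-6', is visible in this hunk) is a fixed-offset tzinfo for central Mexico time, used to stamp the date field. A self-contained sketch of what such a class looks like; the collapsed method bodies are assumptions:

    from datetime import datetime, timedelta, tzinfo

    class UTC(tzinfo):
        """Fixed UTC-6 offset, no DST handling."""
        def utcoffset(self, dt):
            return timedelta(hours=-6)
        def tzname(self, dt):
            return 'UTC-6'
        def dst(self, dt):
            return timedelta(0)

    print(datetime(2017, 12, 20, tzinfo=UTC()).isoformat())  # 2017-12-20T00:00:00-06:00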
@@ -8,7 +8,13 @@
 import scrapy


-class EluniversalItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class EluniversalPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'elUniversal.spiders'
 #USER_AGENT = 'elUniversal (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'elUniversal.pipelines.EluniversalPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'elUniversal.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from elUniversal.items import NoticiasItem

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json
-'''
+"""
+MEDIO:
+El Universal, CDMX
+
+USO:
+scrapy crawl noticias --nolog -s filename=2017-12-20.json
+"""

 TAG_RE = re.compile(r'<[^>]+>')
@@ -10,16 +15,6 @@ def remove_tags(text):
     return TAG_RE.sub('', text)

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
...
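Note: remove_tags() strips any markup left in the fragments the spider extracts. A one-line check of the behaviour:

    import re

    TAG_RE = re.compile(r'<[^>]+>')

    def remove_tags(text):
        return TAG_RE.sub('', text)

    print(remove_tags('<p>Texto con <b>etiquetas</b></p>'))  # Texto con etiquetas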
@@ -8,7 +8,13 @@
 import scrapy


-class SoldemexItem(scrapy.Item):
+class NoticiasItem(scrapy.Item):
     # define the fields for your item here like:
     # name = scrapy.Field()
-    pass
+    title = scrapy.Field()
+    text = scrapy.Field()
+    date = scrapy.Field()
+    location = scrapy.Field()
+    author = scrapy.Field()
+    topic = scrapy.Field()
+    url = scrapy.Field()
@@ -5,7 +5,71 @@
 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+from collections import OrderedDict
+
+
+class JsonWriterPipeline(object):
+    def __init__(self, filename):
+        self.filename = filename
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Here you get whatever value was passed through the "filename" command line parameter
+        settings = crawler.settings
+        filename = settings.get('filename')
+        # Instantiate the pipeline with the file name
+        return cls(filename)
+
+    def open_spider(self, spider):
+        self.counter = 0
+        self.file = open(self.filename, 'w')
+        self.file.write("[")
+
+    def close_spider(self, spider):
+        self.file.write("]")
+        self.file.close()

-class SoldemexPipeline(object):
     def process_item(self, item, spider):
+        # print("this is my item", item)
+        row = []
+
+        try:
+            row.append(("date", item['date']))
+        except:
+            pass
+        try:
+            row.append(("topic", item['topic']))
+        except:
+            pass
+        try:
+            row.append(("title", item['title']))
+        except:
+            pass
+        try:
+            row.append(("author", item['author']))
+        except:
+            pass
+        try:
+            row.append(("location", item['location']))
+        except:
+            pass
+        try:
+            row.append(("text", item['text']))
+        except:
+            pass
+        try:
+            row.append(("url", item['url']))
+        except:
+            pass
+
+        line = OrderedDict(row)
+
+        self.counter += 1
+        if self.counter == 1:
+            self.file.write(json.dumps(line))
+        elif self.counter > 1:
+            self.file.write(",\n" + json.dumps(line))
+
         return item
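Note: the counter logic above writes the output as one growing JSON array: "[" on open_spider, the first object as-is, ",\n" before every later object, and "]" on close_spider. A standalone illustration of that framing (field values are placeholders):

    import json
    from collections import OrderedDict

    records = [OrderedDict([("title", "Nota 1")]), OrderedDict([("title", "Nota 2")])]
    out = "[" + ",\n".join(json.dumps(r) for r in records) + "]"
    print(out)
    # [{"title": "Nota 1"},
    # {"title": "Nota 2"}]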
@@ -19,7 +19,7 @@ NEWSPIDER_MODULE = 'solDeMex.spiders'
 #USER_AGENT = 'solDeMex (+http://www.yourdomain.com)'

 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+# ROBOTSTXT_OBEY = True

 # Configure maximum concurrent requests performed by Scrapy (default: 16)
 #CONCURRENT_REQUESTS = 32
@@ -27,13 +27,13 @@ ROBOTSTXT_OBEY = True
 # Configure a delay for requests for the same website (default: 0)
 # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+DOWNLOAD_DELAY = 0.5
 # The download delay setting will honor only one of:
 #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 #CONCURRENT_REQUESTS_PER_IP = 16

 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+COOKIES_ENABLED = False

 # Disable Telnet Console (enabled by default)
 #TELNETCONSOLE_ENABLED = False
@@ -64,9 +64,9 @@ ROBOTSTXT_OBEY = True
 # Configure item pipelines
 # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'solDeMex.pipelines.SoldemexPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'solDeMex.pipelines.JsonWriterPipeline': 300,
+}

 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
...
+# -*- coding: utf-8 -*-
 import scrapy, re
+from solDeMex.items import NoticiasItem
 from datetime import datetime, timedelta, tzinfo

-'''
-scrapy crawl noticias -t json --nolog -o noticias.json
-'''
+"""
+MEDIO:
+El Sol de México, CDMX
+
+USO:
+scrapy crawl noticias --nolog -s filename=2018-01-20.json
+"""

 TAG_RE = re.compile(r'<[^>]+>')
 def remove_tags(text):
     return TAG_RE.sub('', text)
@@ -23,16 +27,6 @@ class UTC(tzinfo):
         return 'UTC-6'

-class NoticiasItem(scrapy.Item):
-    title = scrapy.Field()
-    text = scrapy.Field()
-    date = scrapy.Field()
-    location = scrapy.Field()
-    author = scrapy.Field()
-    topic = scrapy.Field()
-    url = scrapy.Field()
-
 class QuotesSpider(scrapy.Spider):
     name = "noticias"
...