Commit f6e99c05 authored by Renán Sosa Guillen

crawlers

parent 32d4ecab
......@@ -18,3 +18,5 @@ class NoticiasItem(scrapy.Item):
author = scrapy.Field()
topic = scrapy.Field()
url = scrapy.Field()
twitter = scrapy.Field()
email = scrapy.Field()
......@@ -13,14 +13,18 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """Return ``text`` with every substring matched by ``TAG_RE`` deleted."""
    cleaned = TAG_RE.sub('', text)
    return cleaned
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
# Byline: a newline, the word "Por", then a lazy run of characters up to the next newline.
AUTH_RE = re.compile(r'\nPor.+?\n')
# Twitter handle: an optional "\nTwitter:" (or "\n| Twitter:") label followed by
# "@handle" plus one trailing character; matched case-insensitively.
TW_RE = re.compile(r'(\n(\| )?Twitter:\s+)?(@[\w.%+-]+.)', re.I)
# Location / agency: from a newline up to "/PL", "/AFP" or "/DPA"; matched case-insensitively.
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA)', re.I)
# Email address, optionally preceded by a newline and an "Email:",
# "Correo electr...nico:" or "Comentario(s):" label; may end with one whitespace character.
EM_RE = re.compile(r'(\n(Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
# NOTE: the email pattern is meant to be applied twice in a row.
# Phrases to remove: '\nFotografías\n', '\nDiario Co Latino\n'
class QuotesSpider(scrapy.Spider):
name = "noticias"
......@@ -68,6 +72,49 @@ class QuotesSpider(scrapy.Spider):
for p in response.xpath('//div[@class="entry"]/div/span').extract():
text += remove_tags(p) + "\n"
text = "\n" + text
""" Obtiene autor """
res = AUTH_RE.search(text)
if res:
m = res.group(0)
item['author'] = m[m.find('Por')+len('Por'):].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina twitter """
res = TW_RE.search(text)
if res:
m = res.group(0)
item['twitter'] = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Obtiene lugar """
res = LOC_RE.search(text)
if res:
m = res.group(0)
if m[m.find('/') + 1:].strip().lower() != 'dpa':
item['location'] = m[:m.find('/')].strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
""" Elimina correo """
res = EM_RE.search(text)
if res:
m = res.group(0)
item['email'] = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
res = EM_RE.search(text)
if res:
m = res.group(0)
item['email'] = m.strip()
text = text[text.find(m) + len(m):].strip()
text = "\n" + text
text[text.find("\nDiario Co Latino\n") + len("\nDiario Co Latino\n")]
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment