Commit ee680ed4 authored by Renán Sosa Guillen

crawlers

parent f6e99c05
@@ -55,6 +55,14 @@ class JsonWriterPipeline(object):
             row.append(("location", item['location']))
         except:
             pass
+        try:
+            row.append(("twitter", item['twitter']))
+        except:
+            pass
+        try:
+            row.append(("email", item['email']))
+        except:
+            pass
         try:
             row.append(("text", item['text']))
         except:
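The hunk above appends each optional field inside its own try/except, so items that lack 'twitter' or 'email' still produce a row. A minimal standalone sketch of that behaviour (not the project's actual pipeline code; the sample item is made up):

item = {'location': 'San Salvador', 'text': 'Body of the article.'}  # no twitter/email keys

row = []
for field in ('location', 'twitter', 'email', 'text'):
    try:
        row.append((field, item[field]))
    except KeyError:
        pass

print(row)  # [('location', 'San Salvador'), ('text', 'Body of the article.')]
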
@@ -20,9 +20,9 @@ def remove_tags(text):
 # TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
 # TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
 AUTH_RE = re.compile(r'\nPor.+?\n')
-TW_RE = re.compile(r'(\n(\| )?Twitter:\s+)?(@[\w.%+-]+.)', re.I)
-LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA)', re.I)
-EM_RE = re.compile(r'(\n(Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?')
+TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
+LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
+EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
 # ** e-mail: run the pattern twice in a row
 # Phrases to remove: '\nFotografías\n', '\nDiario Co Latino\n'
 class QuotesSpider(scrapy.Spider):
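The three rewritten patterns are now anchored to whole lines (a leading and a trailing '\n'), and LOC_RE also accepts the SIGNIS ALC tag. A quick standalone check of what the new patterns match (not part of the diff; the sample text is made up):

import re

TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

sample = "\nSAN SALVADOR/PL\nTwitter: @alguien\ncorreo@example.com\nBody of the article."

print(repr(LOC_RE.match(sample).group(0)))   # '\nSAN SALVADOR/PL\n'
print(repr(TW_RE.search(sample).group(0)))   # '\nTwitter: @alguien\n'
print(repr(EM_RE.search(sample).group(0)))   # '\ncorreo@example.com\n'
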
@@ -74,7 +74,7 @@ class QuotesSpider(scrapy.Spider):
         text = "\n" + text
         """ Get the author """
-        res = AUTH_RE.search(text)
+        res = AUTH_RE.match(text)
         if res:
             m = res.group(0)
             item['author'] = m[m.find('Por')+len('Por'):].strip()
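Switching AUTH_RE from search() to match() means the byline is only consumed when it sits at the very start of the newline-prefixed text, instead of anywhere in the body. A standalone illustration with made-up strings:

import re

AUTH_RE = re.compile(r'\nPor.+?\n')

top = "\nPor Juan Pérez\nBody of the article...\n"                       # byline on the first line
later = "\nBody of the article...\nPor otra parte, el texto sigue...\n"  # "Por" only inside the body

print(AUTH_RE.match(top).group(0).strip())   # Por Juan Pérez
print(AUTH_RE.search(later) is not None)     # True  -> the old search() would have fired here
print(AUTH_RE.match(later) is None)          # True  -> match() leaves the body text alone
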
@@ -90,30 +90,39 @@ class QuotesSpider(scrapy.Spider):
         text = "\n" + text
         """ Get the location """
-        res = LOC_RE.search(text)
+        res = LOC_RE.match(text)
         if res:
             m = res.group(0)
             if m[m.find('/') + 1:].strip().lower() != 'dpa':
                 item['location'] = m[:m.find('/')].strip()
                 text = text[text.find(m) + len(m):].strip()
                 text = "\n" + text
+            else:
+                text = text[text.find(m) + len(m):].strip()
+                text = "\n" + text
         """ Remove the e-mail address """
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
             item['email'] = m.strip()
-            text = text[text.find(m) + len(m):].strip()
+            # text = text[text.find(m) + len(m):].strip()
+            text = text.replace(m, '').strip()
             text = "\n" + text
         res = EM_RE.search(text)
         if res:
             m = res.group(0)
             item['email'] = m.strip()
-            text = text[text.find(m) + len(m):].strip()
+            # text = text[text.find(m) + len(m):].strip()
+            text = text.replace(m, '').strip()
             text = "\n" + text
-        text[text.find("\nDiario Co Latino\n") + len("\nDiario Co Latino\n")]
+        text = text.replace("\n@Diario Co Latino\n", '').strip()
+        text = "\n" + text
+        text = text.replace("\nDiario Co Latino\n", '').strip()
+        text = "\n" + text
+        text = text.replace("\nCo Latino\n", '').strip()
         # result = LOC_RE.search(text)
         # if result:
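Put together, the new logic runs the e-mail pattern twice in a row (as the comment next to the pattern definitions asks), removes each match with str.replace() instead of slicing past it, and then strips the newspaper signature lines. A standalone walk-through on a made-up article tail (not the spider's actual code):

import re

EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')

text = ("\nCorreo electrónico: redaccion@example.com"
        "\ncomentarios@example.com"
        "\nDiario Co Latino"
        "\nBody of the article.")

for _ in range(2):                      # run the e-mail pattern twice in a row
    res = EM_RE.search(text)
    if res:
        m = res.group(0)
        email = m.strip()               # the spider stores this on item['email']
        text = "\n" + text.replace(m, '').strip()

text = "\n" + text.replace("\nDiario Co Latino\n", '').strip()   # drop the signature line

print(repr(text))   # '\nBody of the article.'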