Commit 808146d9 authored by Renán Sosa Guillen

crawlers

parent ee680ed4
...@@ -14,17 +14,12 @@ def remove_tags(text): ...@@ -14,17 +14,12 @@ def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?')
# G_RE = re.compile(r' ?- ?')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
AUTH_RE = re.compile(r'\nPor.+?\n') AUTH_RE = re.compile(r'\nPor.+?\n')
TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I) TW_RE = re.compile(r'\n((\| )?Twitter:\s+)?@[\w.%+-]+.\n', re.I)
LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I) LOC_RE = re.compile(r'\n.*?\/(PL|AFP|DPA|SIGNIS ALC)\n', re.I)
EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n') EM_RE = re.compile(r'\n((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?\n')
#** correo pasarlo dos veces seguidas
#Frases a quitar: '\nFotografías\n', '\nDiario Co Latino\n'
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
...@@ -124,19 +119,6 @@ class QuotesSpider(scrapy.Spider): ...@@ -124,19 +119,6 @@ class QuotesSpider(scrapy.Spider):
text = "\n" + text text = "\n" + text
text = text.replace("\nCo Latino\n", '').strip() text = text.replace("\nCo Latino\n", '').strip()
# result = LOC_RE.search(text)
# if result:
# m = result.group(0)
# location = G_RE.sub('', m).strip()
# if len(location) <= 35:
# item['location'] = location
# text = text[text.find(m)+len(m):]
# text = EM_RE.sub('', text)
# text = TW_RE.sub('', text)
# text = TW2_RE.sub('', text)
# text = TAG2_RE.sub("\n", text)
# text = TAG3_RE.sub('', text)
item['text'] = text.strip() item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
......
...@@ -13,13 +13,9 @@ TAG_RE = re.compile(r'<[^>]+>') ...@@ -13,13 +13,9 @@ TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text): def remove_tags(text):
return TAG_RE.sub('', text) return TAG_RE.sub('', text)
# LOC_RE = re.compile(r'\n.+?,? ?.+? ?\. ?- ?') LOC_RE1 = re.compile(r'\n([A-Z]+ )+ ?[.-]')
# G_RE = re.compile(r' ?- ?') LOC_RE2 = re.compile(r'\n.+?,? ?.+? ?(\. ?-|\.|-) ?[A-Z]')
# EM_RE = re.compile(r'((Email|Correo electr.{1,3}nico|Comentarios?):\s)?[\w.-]+@[\w-]+(\.[a-zA-Z]{2,6}){1,2}\s?') SOURCE_RE = re.compile(r'\n ?Fuente:.+$')
# TW_RE = re.compile(r'M.{1,3}s de la P.{1,3}lvora en Twitter: @[\w.%+-]+.', re.I)
# TW2_RE = re.compile(r'((\| )?Twitter:\s+)?(@[\w.%+-]+.)?', re.I)
# TAG2_RE = re.compile(r'\ntransition_[^\]]+\]')
# TAG3_RE = re.compile(r'\[[^\]]+[\]\n]')
class QuotesSpider(scrapy.Spider): class QuotesSpider(scrapy.Spider):
name = "noticias" name = "noticias"
...@@ -76,19 +72,35 @@ class QuotesSpider(scrapy.Spider): ...@@ -76,19 +72,35 @@ class QuotesSpider(scrapy.Spider):
for p in response.xpath('//div[@class="td-post-content"]').css('p').extract(): for p in response.xpath('//div[@class="td-post-content"]').css('p').extract():
text += remove_tags(p) + "\n" text += remove_tags(p) + "\n"
# result = LOC_RE.search(text) text = text.strip()
# if result: text = "\n" + text
# m = result.group(0) text = text.replace(u'\u2013', "-")
# location = G_RE.sub('', m).strip() text = text.replace(u'\u00a0', '') ## Elimina 'no-break spaces'
# if len(location) <= 35:
# item['location'] = location res = LOC_RE1.match(text)
# text = text[text.find(m)+len(m):] if res:
m = res.group(0)[:-1]
# text = EM_RE.sub('', text) location = m.replace("-", '').strip()
# text = TW_RE.sub('', text) if len(location) <= 25:
# text = TW2_RE.sub('', text) item['location'] = location
# text = TAG2_RE.sub("\n", text) text = text.replace(m, '').strip()
# text = TAG3_RE.sub('', text) text = "\n" + text
res = LOC_RE2.match(text)
if res:
m = res.group(0)[:-1]
location = m.replace("-", '').replace(".", '').strip()
if len(location) <= 25:
item['location'] = location
text = text.replace(m, '').strip()
text = "\n" + text
res = SOURCE_RE.search(text)
if res:
m = res.group(0)
text = text.replace(m, '').strip()
text = "\n" + text
item['text'] = text.strip() item['text'] = text.strip()
item['url'] = response.url item['url'] = response.url
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment