Commit 5c3ec077 authored by Renán Sosa Guillen

crawl rss

parent ca85aaa9
...@@ -4,10 +4,12 @@ import sys ...@@ -4,10 +4,12 @@ import sys
import json import json
import os import os
import datetime import datetime
from collections import OrderedDict
today = datetime.datetime.now() today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/" baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/" scrapyDir = "/home/geoint/crawlersNoticias/"
row = {}
with open(sys.argv[1]) as data_file: with open(sys.argv[1]) as data_file:
siteList = json.load(data_file) siteList = json.load(data_file)
os.chdir(baseDir) os.chdir(baseDir)
...@@ -84,16 +86,58 @@ with open(sys.argv[1]) as data_file: ...@@ -84,16 +86,58 @@ with open(sys.argv[1]) as data_file:
for line in master: for line in master:
counter += 1 counter += 1
if media == 'elFinanciero':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('url', line['url']),
('text', line['text'])
])
elif media == 'elUniversal':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('location', line['location']),
('url', line['url']),
('text', line['text'])
])
if counter == 1: if counter == 1:
infile3.write(json.dumps(line)) infile3.write(json.dumps(row))
elif counter > 1: elif counter > 1:
infile3.write(',\n' + json.dumps(line)) infile3.write(',\n' + json.dumps(row))
for line in slave: for line in slave:
if not line['url'] in urlSet: if not line['url'] in urlSet:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d') lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
if lineDate == currentDate: if lineDate == currentDate:
infile3.write(',\n' + json.dumps(line))
if media == 'elFinanciero':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('url', line['url']),
('text', line['text'])
])
elif media == 'elUniversal':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('location', line['location']),
('url', line['url']),
('text', line['text'])
])
infile3.write(',\n' + json.dumps(row))
elif (currentDate - lineDate).days == 1: elif (currentDate - lineDate).days == 1:
YESTERDAY = True YESTERDAY = True
...@@ -119,15 +163,57 @@ with open(sys.argv[1]) as data_file: ...@@ -119,15 +163,57 @@ with open(sys.argv[1]) as data_file:
for line in master: for line in master:
counter += 1 counter += 1
if media == 'elFinanciero':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('url', line['url']),
('text', line['text'])
])
elif media == 'elUniversal':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('location', line['location']),
('url', line['url']),
('text', line['text'])
])
if counter == 1: if counter == 1:
infile3.write(json.dumps(line)) infile3.write(json.dumps(row))
elif counter > 1: elif counter > 1:
infile3.write(',\n' + json.dumps(line)) infile3.write(',\n' + json.dumps(row))
for line in slave: for line in slave:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d') lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
if not line['url'] in urlSet and lineDate == currentDate: if not line['url'] in urlSet and lineDate == currentDate:
infile3.write(',\n' + json.dumps(line))
if media == 'elFinanciero':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('url', line['url']),
('text', line['text'])
])
elif media == 'elUniversal':
row = OrderedDict([
('date', line['date']),
('topic', line['topic']),
('title', line['title']),
('author', line['author']),
('location', line['location']),
('url', line['url']),
('text', line['text'])
])
infile3.write(',\n' + json.dumps(row))
infile3.write(']') infile3.write(']')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment