Commit bf32b4c9 authored by Renán Sosa Guillen

crawl rss

parent 20632ff9
...@@ -10,7 +10,6 @@ today = datetime.datetime.now() ...@@ -10,7 +10,6 @@ today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/" baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/" scrapyDir = "/home/geoint/crawlersNoticias/"
row = {} row = {}
urlSet = set()
with open(sys.argv[1]) as data_file: with open(sys.argv[1]) as data_file:
siteList = json.load(data_file) siteList = json.load(data_file)
os.chdir(baseDir) os.chdir(baseDir)
...@@ -77,15 +76,14 @@ with open(sys.argv[1]) as data_file: ...@@ -77,15 +76,14 @@ with open(sys.argv[1]) as data_file:
f1 = mydir + '/' + filename f1 = mydir + '/' + filename
f2 = filename f2 = filename
f3 = baseDir + media + '/' + filename f3 = baseDir + media + '/' + filename
with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3: try:
counter = 0 with open(f1) as infile1, open(f2) as infile2, open(f3, 'a') as infile3:
slave = json.load(infile2)
infile3.write('[')
try:
master = json.load(infile1) master = json.load(infile1)
slave = json.load(infile2)
urlSet = set([line['url'] for line in master]) urlSet = set([line['url'] for line in master])
counter = 0
infile3.write('[')
for line in master: for line in master:
counter += 1 counter += 1
...@@ -114,46 +112,42 @@ with open(sys.argv[1]) as data_file: ...@@ -114,46 +112,42 @@ with open(sys.argv[1]) as data_file:
infile3.write(json.dumps(row)) infile3.write(json.dumps(row))
elif counter > 1: elif counter > 1:
infile3.write(',\n' + json.dumps(row)) infile3.write(',\n' + json.dumps(row))
except IOError:
pass
for line in slave:
if not line['url'] in urlSet:
lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
if lineDate == currentDate:
counter += 1
if media == 'elFinanciero': for line in slave:
row = OrderedDict([ if not line['url'] in urlSet:
('date', line['date']), lineDate = datetime.datetime.strptime(line['date'][:10], '%Y-%m-%d')
('topic', line['topic']), if lineDate == currentDate:
('title', line['title']),
('author', line['author']), if media == 'elFinanciero':
('url', line['url']), row = OrderedDict([
('text', line['text']) ('date', line['date']),
]) ('topic', line['topic']),
elif media == 'elUniversal': ('title', line['title']),
row = OrderedDict([ ('author', line['author']),
('date', line['date']), ('url', line['url']),
('topic', line['topic']), ('text', line['text'])
('title', line['title']), ])
('author', line['author']), elif media == 'elUniversal':
('location', line['location']), row = OrderedDict([
('url', line['url']), ('date', line['date']),
('text', line['text']) ('topic', line['topic']),
]) ('title', line['title']),
('author', line['author']),
('location', line['location']),
('url', line['url']),
('text', line['text'])
])
if counter == 1:
infile3.write(json.dumps(row))
elif counter > 1:
infile3.write(',\n' + json.dumps(row)) infile3.write(',\n' + json.dumps(row))
elif (currentDate - lineDate).days == 1: elif (currentDate - lineDate).days == 1:
YESTERDAY = True YESTERDAY = True
infile3.write(']') infile3.write(']')
os.system("mv " + f3 + " " + mydir) os.system("mv " + f3 + " " + mydir)
# os.system("rm " + f2) # os.system("rm " + f2)
except:
os.system("cp " + f2 + " " + mydir)
if YESTERDAY: if YESTERDAY:
currentDate -= datetime.timedelta(days=1) currentDate -= datetime.timedelta(days=1)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment