Commit 38c3e463 authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

tracker

parent dd06c629
......@@ -4,7 +4,6 @@ import sys
import json
import os
import datetime
from collections import OrderedDict
"""
Descarga las noticias de un sitio desde entre dos fechas especificas.
......@@ -12,62 +11,7 @@ USO:
tracker.py data.json
"""
def dictRowGenerator(line):
    """Build an ordered list of (field, value) pairs from a scraped news item.

    Args:
        line: dict of a single news record as loaded from the crawler's JSON
            output; any of the known fields may be absent.

    Returns:
        A list of (key, value) tuples in a fixed canonical field order,
        containing only the fields actually present in ``line``.  The list
        is meant to seed an OrderedDict downstream.
    """
    # Canonical output order; must stay stable so the re-serialized JSON
    # always lists fields in the same sequence.
    fields = ("date", "topic", "title", "author", "location", "text", "url")
    row = []
    for key in fields:
        # The original used seven copy-pasted bare try/except blocks, which
        # silently swallowed *every* error.  Catching only KeyError keeps the
        # "skip missing field" behavior without masking unrelated bugs.
        try:
            row.append((key, line[key]))
        except KeyError:
            pass
    return row
def parse_json(mydir, media, filename):
    """Re-serialize a crawler output file with fields in canonical order.

    Reads the JSON array in ``media/filename``, rewrites every record with
    its keys in the order produced by ``dictRowGenerator``, writes the
    result to ``filename`` in the current directory, then moves that file
    into ``mydir``.

    Args:
        mydir: destination directory for the normalized file.
        media: directory holding the raw crawler output.
        filename: name of the JSON file (same name for input and output).
    """
    import shutil  # local import so the file-level import block is untouched

    # Mode "w" (not the original "a"): appending to a leftover file from a
    # previous run would produce invalid JSON.
    with open(media + "/" + filename) as inputFile, open(filename, "w") as outputFile:
        jsonFile = json.load(inputFile)
        outputFile.write("[")
        for counter, line in enumerate(jsonFile, start=1):
            row = OrderedDict(dictRowGenerator(line))
            # Records are comma-newline separated; no separator before the first.
            if counter > 1:
                outputFile.write(",\n")
            outputFile.write(json.dumps(row))
        outputFile.write("]")
    # shutil.move replaces os.system("mv ..."): portable, no shell quoting
    # issues if the path ever contains spaces.
    shutil.move(filename, os.path.join(mydir, filename))
## START (script entry point begins here)
# today = datetime.datetime.now()
# Root directory where per-media news JSON files are archived.
baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# Directory containing the Scrapy crawler projects, one subdirectory per news site.
scrapyDir = "/home/geoint/crawlersNoticias/"
......@@ -122,7 +66,7 @@ with open(sys.argv[1]) as data_file:
# NOTE(review): this span is a diff fragment — indentation was flattened by the
# diff viewer and both the removed and the added side of the commit appear as
# plain lines.  It is not runnable as-is; comments below mark the duplicates.
# Iterate day-of-year up to Dec 31 (or up to `hasta` when it falls in year y).
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if hasta.year!=y else hasta.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json"
# OLD diff side: scrapy's own JSON feed exporter (-t json -o file).
scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
# NEW diff side: filename passed as a crawler setting instead; this second
# assignment is the one that takes effect.
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
# Remember the starting directory so we can return after running the crawler.
mydir = os.getcwd()
print mydir
os.chdir(scrapyDir+s['crawler'])
......@@ -130,13 +74,8 @@ with open(sys.argv[1]) as data_file:
# NOTE(review): diff fragment — removed (old) and added (new) sides coexist
# below; the old branch (parse_json path) was deleted in this commit in favor
# of the one-line move at the end.
print scrapycommand
os.system(scrapycommand)
# A file of <= 3 bytes means the crawl produced no records ("[]" or empty).
fileSize = os.stat(filename).st_size
# OLD diff side (removed by this commit): normalize via parse_json, then clean up.
if fileSize <= 3:
os.system("rm " + filename)
else:
os.chdir("..")
parse_json(mydir, media, filename)
# os.system("mv " + filename + " " + mydir)
os.system("rm " + media + "/" + filename)
# NEW diff side (added by this commit): simply move the raw output into mydir.
if fileSize <= 3: os.system("rm " + filename)
else: os.system("mv " + filename + " " + mydir)
os.chdir(mydir)
# Advance to the next day of the requested date range.
currentDate = currentDate + datetime.timedelta(days=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment