Commit d835cdfc authored by Renán Sosa Guillen's avatar Renán Sosa Guillen

crawl all

parent 17777858
...@@ -4,68 +4,13 @@ import sys ...@@ -4,68 +4,13 @@ import sys
import json import json
import os import os
import datetime import datetime
from collections import OrderedDict
def dictRowGenerator(line):
    """Build an ordered list of (field, value) pairs from one crawled item.

    Fields are emitted in a fixed canonical order; fields missing from
    *line* are skipped (the crawlers do not always fill every field).

    Args:
        line: dict-like news item produced by the scrapy crawler.

    Returns:
        list of (str, value) tuples, ready to feed to OrderedDict.
    """
    # Canonical column order for the normalized output JSON.
    fields = ("date", "topic", "title", "author", "location", "text", "url")
    # Membership test replaces the original seven bare try/except blocks,
    # which silently swallowed every exception type, not just KeyError.
    return [(key, line[key]) for key in fields if key in line]
def parse_json(mydir, media, filename):
    """Normalize a crawler JSON file and move the result into *mydir*.

    Reads ``media/filename``, re-serializes every item with its fields in
    canonical order (see dictRowGenerator), writes the result as a JSON
    array named *filename* in the current directory, then moves that file
    into *mydir*.

    Args:
        mydir: destination directory for the normalized file.
        media: directory holding the crawler's raw JSON output.
        filename: name of the JSON file (same name in source and target).
    """
    import shutil  # local import: only needed for the final move

    with open(os.path.join(media, filename)) as inputFile:
        items = json.load(inputFile)

    rows = [OrderedDict(dictRowGenerator(line)) for line in items]

    # Mode "w", not the original "a": appending to a pre-existing file of
    # the same name would have produced corrupt JSON.
    with open(filename, "w") as outputFile:
        outputFile.write("[")
        outputFile.write(",\n".join(json.dumps(row) for row in rows))
        outputFile.write("]")

    # shutil.move instead of os.system("mv " + ...): portable, works across
    # filesystems, and not shell-injectable through *filename*.
    shutil.move(filename, os.path.join(mydir, filename))
## INICIO
today = datetime.datetime.now() today = datetime.datetime.now()
baseDir = "/home/geoint/virtualHDD/m3/noticias/" # baseDir = "/home/geoint/virtualHDD/m3/noticias/"
scrapyDir = "/home/geoint/crawlersNoticias/" # scrapyDir = "/home/geoint/crawlersNoticias/"
baseDir = "/home/cna_service/prueba/"
scrapyDir = "/home/cna_service/crawler/crawlersNoticias/"
with open(sys.argv[1]) as data_file: with open(sys.argv[1]) as data_file:
siteList = json.load(data_file) siteList = json.load(data_file)
os.chdir(baseDir) os.chdir(baseDir)
...@@ -115,7 +60,8 @@ with open(sys.argv[1]) as data_file: ...@@ -115,7 +60,8 @@ with open(sys.argv[1]) as data_file:
for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1): for d in range(day, ((datetime.date(y,12,31)-datetime.date(y,1,1)).days + 1 if today.year!=y else today.timetuple().tm_yday)+1):
filename = currentDate.strftime('%Y-%m-%d')+".json" filename = currentDate.strftime('%Y-%m-%d')+".json"
scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day) # scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
scrapycommand = "scrapy crawl noticias --nolog -s filename=" + filename + " -a year="+str(currentDate.year)+ " -a month="+str(currentDate.month)+" -a day="+str(currentDate.day)
mydir = os.getcwd() mydir = os.getcwd()
print mydir print mydir
os.chdir(scrapyDir+s['crawler']) os.chdir(scrapyDir+s['crawler'])
...@@ -123,13 +69,8 @@ with open(sys.argv[1]) as data_file: ...@@ -123,13 +69,8 @@ with open(sys.argv[1]) as data_file:
print scrapycommand print scrapycommand
os.system(scrapycommand) os.system(scrapycommand)
fileSize = os.stat(filename).st_size fileSize = os.stat(filename).st_size
if fileSize <= 3: if fileSize <= 3: os.system("rm " + filename)
os.system("rm " + filename) else: os.system("mv " + filename + " " + mydir)
else:
os.chdir("..")
parse_json(mydir, media, filename)
# os.system("mv " + filename + " " + mydir)
os.system("rm " + media + "/" + filename)
os.chdir(mydir) os.chdir(mydir)
currentDate = currentDate + datetime.timedelta(days=1) currentDate = currentDate + datetime.timedelta(days=1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment