animalPolitico

parent 1a13f0d7
@@ -4,41 +4,46 @@ import datetime
 import glob
 import json
 import os
+#===============================================================================
 #===============================================================================
 def main(argv):
     '''
-    '''
-    cwd = os.getcwd()
-    if len(argv) != 2 and len(argv) != 3:
-        print ("Usage: " + argv[0] + "<directory> [endDate:YYYY-MM-DD]")
-    else:
-        rootdir = argv[1]
-        with open(rootdir+"/"+datetime.datetime.today().strftime('%Y-%m-%d')+".log", "a") as logfile:
-            for path in sorted(glob.glob(f'{rootdir}/*/')):
-                print(path)
-                if not os.path.exists(path+'settings.json'):
-                    logfile.write("\t configuration file <"+path+"settings.json> not found.\n")
-                    continue
-                with open(path+'settings.json') as json_file:
-                    cfg = json.load(json_file)
-                print(cfg)
-                if not cfg["enabled"]:
-                    continue
-                logfile.write("Crawler "+cfg["crawler"]+" started at: " +datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S")+"\n")
-                new_cwd = os.getcwd()+"/spiders/"+cfg["crawler"]
-                if os.path.exists(new_cwd):
-                    os.chdir(new_cwd)
-                    os.system("python3 ../../../scripts/siteCrawler.py " + path)
-                else:
-                    logfile.write("\t spider not found.\n")
-                print(os.getcwd())
-                os.chdir(cwd)
+    Main function to process directories and run crawlers.
+    '''
+    cwd = os.getcwd()
+    if len(argv) != 2 and len(argv) != 3:
+        print("Usage: " + argv[0] + " <directory> [endDate:YYYY-MM-DD]")
+    else:
+        rootdir = argv[1]
+        # Ensure the log directory exists
+        if not os.path.exists(rootdir):
+            os.makedirs(rootdir)
+        log_filename = rootdir + "/" + datetime.datetime.today().strftime('%Y-%m-%d') + ".log"
+        with open(log_filename, "a") as logfile:
+            for path in sorted(glob.glob(f'{rootdir}/*/')):
+                print(path)
+                if not os.path.exists(path + 'settings.json'):
+                    logfile.write("\t configuration file <" + path + "settings.json> not found.\n")
+                    continue
+                with open(path + 'settings.json') as json_file:
+                    cfg = json.load(json_file)
+                print(cfg)
+                if not cfg["enabled"]:
+                    continue
+                logfile.write("Crawler " + cfg["crawler"] + " started at: " +
+                              datetime.datetime.now().strftime("%Y-%m-%d, %H:%M:%S") + "\n")
+                new_cwd = os.getcwd() + "/spiders/" + cfg["crawler"]
+                if os.path.exists(new_cwd):
+                    os.chdir(new_cwd)
+                    os.system("python3 ../../../scripts/siteCrawler.py " + path)
+                else:
+                    logfile.write("\t spider not found.\n")
+                print(os.getcwd())
+                os.chdir(cwd)
 #-------------------------------------------------------------------------------
 if __name__ == "__main__":
     main(sys.argv)
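Note on the per-crawler configuration: for each subdirectory of <directory>, the runner reads settings.json but only consumes the "enabled" and "crawler" keys before changing into spiders/<crawler> and shelling out to siteCrawler.py with the directory path. A minimal settings.json this loop would accept is sketched below; the crawler name is a made-up example, and any further keys (an end date, section list, etc.) would belong to siteCrawler.py, which this diff does not show.

{
    "enabled": true,
    "crawler": "animalPolitico"
}

Worth noting: main() accepts an optional third argument ([endDate:YYYY-MM-DD]) but never references argv[2], so the end date presumably has to reach siteCrawler.py through settings.json rather than through this command line.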
@@ -84,14 +84,12 @@ class QuotesSpider(scrapy.Spider):
         self.uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
         self.uri_page = "%22}&type=page&page="
         self.uri_complement = "&size=10"
-        print(self.uri_base+self.uri_page+self.uri_complement)
-        print(self.uri_base+self.uri_page+self.uri_complement)
         for s in sectionList:
             yield scrapy.Request(url=self.baseURL + s, callback=self.parse)
     def parse(self, response):
         print(response.url)
         searchData = ImportantData()
         CONTINUE_SEARCHING = True
...
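Side note on the removed debug prints: they concatenated uri_base, uri_page, and uri_complement directly, without a section slug or page number in between, so they printed a malformed URL. Judging from where the string is split, a real request URL is presumably assembled per section and page along these lines (a sketch; the slug and page values are invented):

# Hypothetical reconstruction of one elfinanciero API request URL.
uri_base = "http://api.elfinanciero.com.mx/public/search/typed/?_format=json&json={%22search%22:%22*%22,%22categoriesslug%22:%22"
uri_page = "%22}&type=page&page="
uri_complement = "&size=10"

slug = "economia"   # hypothetical section slug
page = 1            # hypothetical page number
url = uri_base + slug + uri_page + str(page) + uri_complement
# %22 is a URL-encoded double quote, so the json parameter decodes to
# {"search":"*","categoriesslug":"economia"}, followed by &type=page&page=1&size=10.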