m3 / crawlersNoticias

Commit cf57bd43
authored 7 years ago by Renán Sosa Guillen

tracker

parent 6267f8ed

Showing 1 changed file with 149 additions and 0 deletions.

crawler_script/tracker.py  0 → 100644  (+149 -0)
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import json
import os
import datetime
from collections import OrderedDict

"""
Downloads a site's news between two specific dates.
USAGE:
tracker.py data.json
"""


def dictRowGenerator(line):
    # Builds an ordered list of (key, value) pairs from a scraped item,
    # skipping any field the item does not have.
    row = []
    try:
        row.append(("date", line['date']))
    except:
        pass
    try:
        row.append(("topic", line['topic']))
    except:
        pass
    try:
        row.append(("title", line['title']))
    except:
        pass
    try:
        row.append(("author", line['author']))
    except:
        pass
    try:
        row.append(("location", line['location']))
    except:
        pass
    try:
        row.append(("text", line['text']))
    except:
        pass
    try:
        row.append(("url", line['url']))
    except:
        pass
    return row


def parse_json(mydir, media, filename):
    # Rewrites the raw scrapy output as a JSON array with one item per line
    # and a fixed field order, then moves the result to the target directory.
    with open(media + "/" + filename) as inputFile, open(filename, "a") as outputFile:
        jsonFile = json.load(inputFile)
        counter = 0
        outputFile.write("[")
        for line in jsonFile:
            counter += 1
            auxRow = dictRowGenerator(line)
            row = OrderedDict(auxRow)
            if counter == 1:
                outputFile.write(json.dumps(row))
            elif counter > 1:
                outputFile.write(",\n" + json.dumps(row))
        outputFile.write("]")
    os.system("mv " + filename + " " + mydir)


## START
# today = datetime.datetime.now()
# baseDir = "/home/geoint/virtualHDD/m3/noticias/"
# scrapyDir = "/home/geoint/crawlersNoticias/"
baseDir = "/home/cna_service/prueba/"
scrapyDir = "/home/cna_service/crawler/crawlersNoticias/"
with open(sys.argv[1]) as data_file:
    siteList = json.load(data_file)
os.chdir(baseDir)
for s in siteList:
    desde = datetime.datetime.strptime(s['desde'], "%d-%m-%Y")
    hasta = datetime.datetime.strptime(s['hasta'], "%d-%m-%Y")
    print str(s['nombre'] + ", desde:" + desde.strftime("%Y-%m-%d"))
    # Output directory named after the crawler (the part after the "/").
    media = s['crawler'][s['crawler'].find("/") + 1:]
    try:
        os.makedirs(media)
    except:
        print "ok"
    os.chdir(media)
    # lstYears = os.listdir(".")
    # lstYears.sort()
    year = desde.year
    # if len(lstYears) > 0:
    #     year = int(lstYears[len(lstYears) - 1])
    for y in range(year, hasta.year + 1):
        print y
        try:
            os.makedirs(str(y))
        except:
            print "ok"
        os.chdir(str(y))
        # print os.getcwd()
        # lstDays = os.listdir(".")
        # lstDays = [l for l in lstDays if not l.startswith('.')]
        # lstDays.sort()
        # print lstDays
        day = desde.timetuple().tm_yday
        print day
        currentDate = desde
        # if len(lstDays) > 0:
        #     strDate = lstDays[len(lstDays) - 1]
        #     strDate = strDate[:strDate.find(".")]
        #     currentDate = datetime.datetime.strptime(strDate, '%Y-%m-%d')
        #     day = currentDate.timetuple().tm_yday
        # elif y != desde.year:
        if y != desde.year:
            # Every year after the first one starts on January 1st.
            currentDate = datetime.datetime.strptime(str(y) + "-01-01", '%Y-%m-%d')
            day = 1
        # Iterate day-of-year numbers up to December 31st, or up to the end
        # date when this is the final year of the range.
        for d in range(day, ((datetime.date(y, 12, 31) - datetime.date(y, 1, 1)).days + 1 if hasta.year != y else hasta.timetuple().tm_yday) + 1):
            filename = currentDate.strftime('%Y-%m-%d') + ".json"
            scrapycommand = "scrapy crawl noticias -t json --nolog -o " + filename + " -a year=" + str(currentDate.year) + " -a month=" + str(currentDate.month) + " -a day=" + str(currentDate.day)
            mydir = os.getcwd()
            print mydir
            os.chdir(scrapyDir + s['crawler'])
            print media
            print scrapycommand
            os.system(scrapycommand)
            # An output of 3 bytes or less is an empty result; discard it.
            fileSize = os.stat(filename).st_size
            if fileSize <= 3:
                os.system("rm " + filename)
            else:
                os.chdir("..")
                parse_json(mydir, media, filename)
                # os.system("mv " + filename + " " + mydir)
                os.system("rm " + media + "/" + filename)
            os.chdir(mydir)
            currentDate = currentDate + datetime.timedelta(days=1)
        os.chdir("..")
    os.chdir("..")
# print hasta.year
# scrapy crawl noticias -t json -o $y-$m-$d.json -a year=$y -a month=$m -a day=$d  # runs the corresponding crawler for the site
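The script expects the JSON file passed as its first argument (data.json in the usage note) to be a list of site entries. A minimal sketch of one entry, inferred from the keys the script reads (nombre, desde, hasta, crawler); the site name and crawler path here are hypothetical:

[
    {
        "nombre": "ejemploDiario",
        "desde": "01-01-2017",
        "hasta": "31-12-2017",
        "crawler": "descarga_por_dia/ejemploDiario"
    }
]

The dates must use the %d-%m-%Y format the script passes to strptime, "crawler" must be a path under scrapyDir containing a Scrapy project with a spider named noticias, and the text after the first "/" in "crawler" becomes the output directory name (media).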