Commit fed7af4d authored by Mario Chirinos Colunga

new db

parent 13ce817c
{
"model":"/home/mario/git/Guns_N_Roses/utils/BayesClassifier_pos_1000_neg_1250_acc_0.92.pkl",
"documents":"/home/mario/git/Guns_N_Roses/utils/ids_vistos_primera_anot",
"stopwords":"/home/mario/git/Guns_N_Roses/utils/stopwords",
"stop": 500
}
...@@ -28,6 +28,9 @@ class Command(BaseCommand): ...@@ -28,6 +28,9 @@ class Command(BaseCommand):
categories = dict() categories = dict()
doclist = list() doclist = list()
print(newsList.count())
print(p.num_pages)
categories ={"roses":0, "guns":0}
for i in range (1,p.num_pages): for i in range (1,p.num_pages):
# print(i) # print(i)
docs_new = [d for d in p.page(i) if d.text!=""] docs_new = [d for d in p.page(i) if d.text!=""]
...@@ -41,13 +44,13 @@ class Command(BaseCommand): ...@@ -41,13 +44,13 @@ class Command(BaseCommand):
item['text'] = document item['text'] = document
item['id'] = docs_id[ii] item['id'] = docs_id[ii]
doclist.append(item) doclist.append(item)
if category not in categories: # if category not in categories:
categories[category] = 0; # categories[category] = 0;
categories[category] +=1 categories[category] +=1
ii+=1 ii+=1
finish = True finish = True
for c in categories: for c in categories:
finish = finish and categories[c] >=1000 finish = finish and categories[c] >= 5000
if finish: if finish:
break break
...@@ -59,7 +62,7 @@ class Command(BaseCommand): ...@@ -59,7 +62,7 @@ class Command(BaseCommand):
print(categories) print(categories)
with open('data.json', 'w', encoding='utf8') as outfile: with open('data.json', 'w', encoding='utf8') as outfile:
s = json.dumps(doclist, ensure_ascii=False, indent=2) s = json.dumps(doclist, ensure_ascii=False, indent=2)
print (s) # print (s)
outfile.write(s) outfile.write(s)
......
from django.core.management.base import BaseCommand, CommandError
from catalog.models import User, News, Publisher, Topic, audioTime, Search
from django.db.models import Q
import operator
from django.core.paginator import Paginator
import os
#import simplejson as json
import json
from django.db.models import Q
import sys
sys.path.append("/home/mario/git/")
from Guns_N_Roses import M3GunsNRoses
import datetime
#from os import path
import codecs
import nltk
from nltk.stem import SnowballStemmer
from textblob.classifiers import (NaiveBayesClassifier)
import dill as pickle
class Command(BaseCommand):
    """Classify stored news items and export the results to ``data.json``.

    The command takes one positional argument, ``cfg``: the path of a JSON
    file providing the keys ``model``, ``stopwords``, ``documents`` and
    ``stop``.
    """
    help = 'Report database'

    def add_arguments(self, parser):
        """Register the single positional ``cfg`` argument (path to the JSON config)."""
        parser.add_argument('cfg', nargs=1, type=str)

    def handle(self, *args, **options):
        """Run the classifier over the selected news and write ``data.json``.

        News are restricted to a fixed list of publisher ids, dated on or
        after 2014-01-01, excluding the ids listed in the ``documents`` file.
        They are read in pages of 50, and processing stops once the number of
        items labelled ``"guns"`` reaches ``cfg["stop"]``.
        """
        with open(options['cfg'][0]) as f:
            cfg = json.load(f)
        print(cfg['model'])

        clf = M3GunsNRoses.load_model(cfg["model"])
        stopwords = M3GunsNRoses.file2list(cfg["stopwords"])
        # Ids that must be left out of the query (presumably already
        # annotated documents -- TODO confirm against the data files).
        documentsOut = M3GunsNRoses.file2list(cfg["documents"])

        pub = [3, 46, 39, 43, 45, 41, 40, 6, 7, 8, 27, 9, 10, 12, 13, 14, 15, 17, 31, 21, 24]
        publishers = Publisher.objects.all().filter(id__in=pub)
        myQuery = (
            Q(publisher__in=publishers)
            & Q(date__gte=datetime.date(2014, 1, 1))
            & ~Q(id__in=documentsOut)
        )
        newsList = News.objects.all().filter(myQuery).order_by('id')
        p = Paginator(newsList, 50)

        print("Classifing...")
        doclist = list()
        categories = {"guns": 0, "roses": 0}

        # page_range covers 1..num_pages inclusive; range(1, p.num_pages)
        # silently skipped the last page.
        for i in p.page_range:
            docs_new = [d for d in p.page(i) if d.text != ""]
            for doc in docs_new:
                text = doc.text.replace("\xa0", " ").replace("\x93", "").replace("\x94", "").replace('"', '').replace("'", "")
                # dist_classes is used as a mapping of label -> score.
                dist_classes = M3GunsNRoses.NBanalyse(clf, text, stopwords)

                item = dict()
                item['category'] = max(dist_classes.items(), key=operator.itemgetter(1))[0]
                item['probabilities'] = dist_classes
                item['text'] = text
                item['id'] = doc.id
                print(dist_classes)
                doclist.append(item)

                category = item['category']
                # Tolerate a label outside the two pre-seeded ones instead of
                # raising KeyError.
                categories[category] = categories.get(category, 0) + 1
                print(dist_classes)

            # NOTE(review): the original indentation of this check was lost;
            # it is applied once per page so that the break ends the page
            # loop -- confirm this matches the intended stopping rule.
            if categories["guns"] >= cfg["stop"]:
                break

        with open('data.json', 'w', encoding='utf8') as outfile:
            s = json.dumps(doclist, ensure_ascii=False, indent=2)
            print(s)
            outfile.write(s)
from django.core.management.base import BaseCommand, CommandError
from catalog.models import User, News, Publisher, Topic, audioTime, Search
from django.db.models import Q
import operator
from django.core.paginator import Paginator
import os
#import simplejson as json
import json
from django.db.models import Q
import sys
sys.path.append("/home/mario/git/")
from Guns_N_Roses import M3GunsNRoses
import datetime
#from os import path
import codecs
import nltk
from nltk.stem import SnowballStemmer
from textblob.classifiers import (NaiveBayesClassifier)
import dill as pickle
#-------------------------------------------------------------------------------------------------
def file2list(fname):
    """Return the non-empty lines of the text file *fname* as a list.

    The trailing newline is removed from every line; other whitespace is
    kept as-is. Lines that are empty after that removal are skipped.

    The previous filter (``not l is ''``) compared identity against a string
    literal and ran before the newline was stripped, so blank lines were
    never actually dropped.
    """
    with open(fname) as f:
        stripped = (line.strip('\n') for line in f)
        # Consume the generator while the file is still open.
        return [line for line in stripped if line != '']
#-------------------------------------------------------------------------------------------------
class Command(BaseCommand):
    """Export the news matching any word of a word list to ``dataNews.json``.

    The command takes one positional argument, ``words``: the path of a text
    file with one search word per line.
    """
    help = 'Report database'

    def add_arguments(self, parser):
        """Register the single positional ``words`` argument (path to the word file)."""
        parser.add_argument('words', nargs=1, type=str)

    def handle(self, *args, **options):
        """Collect the news matching each word and dump them as JSON.

        News are restricted to a fixed list of publisher ids and to dates on
        or after 2014-01-01. Each word is matched against ``search_vector``
        in its own query; the matching ids are merged in a set so that a
        news item matching several words is exported once.
        """
        wordList = file2list(options['words'][0])
        # Was print(list), which printed the builtin type, not the words.
        print(wordList)

        pub = [3, 46, 39, 43, 45, 41, 40, 6, 7, 8, 27, 9, 10, 12, 13, 14, 15, 17, 31, 21, 24]
        publishers = Publisher.objects.all().filter(id__in=pub)
        # The publisher/date restriction is the same for every word.
        baseQuery = Q(publisher__in=publishers) & Q(date__gte=datetime.date(2014, 1, 1))

        ids = set()
        for w in wordList:
            matches = News.objects.all().filter(baseQuery & Q(search_vector=w))
            # Only the primary keys are needed here; avoid building full
            # model instances for every match.
            ids.update(matches.values_list('id', flat=True))

        newsList = News.objects.all().filter(Q(id__in=ids))
        print("end")
        print(newsList.count())

        doclist = [
            {"text": n.text, "id": n.id, "date": n.date.strftime("%Y-%m-%d"), "url": n.url}
            for n in newsList
        ]
        with open('dataNews.json', 'w', encoding='utf8') as outfile:
            s = json.dumps(doclist, ensure_ascii=False, indent=2)
            outfile.write(s)
...@@ -8,10 +8,18 @@ class Command(BaseCommand): ...@@ -8,10 +8,18 @@ class Command(BaseCommand):
help = 'Report database' help = 'Report database'
# def add_arguments(self, parser): def add_arguments(self, parser):
# parser.add_argument('basedir', nargs=1, type=str) #(options['basedir'][0] parser.add_argument('json', nargs=1, type=str) #(options['basedir'][0]
def handle(self, *args, **options): def handle(self, *args, **options):
data = dict()
data['users'] = User.objects.all().count()
data['publishers'] = Publisher.objects.all().count()
data['text'] = Publisher.objects.all().filter(type="texto").count()
data['audio'] = Publisher.objects.all().filter(type="audio").count()
data['documents'] = News.objects.all().count()
data['searches'] = Search.objects.all().count()
print ( "Users: " + str(User.objects.all().count()) ) print ( "Users: " + str(User.objects.all().count()) )
print ( "Publishers: " + str(Publisher.objects.all().count()) ) print ( "Publishers: " + str(Publisher.objects.all().count()) )
print ( "\tText: " + str(Publisher.objects.all().filter(type="texto").count()) ) print ( "\tText: " + str(Publisher.objects.all().filter(type="texto").count()) )
...@@ -36,3 +44,6 @@ class Command(BaseCommand): ...@@ -36,3 +44,6 @@ class Command(BaseCommand):
audioSources = Publisher.objects.all().filter(type="audio") audioSources = Publisher.objects.all().filter(type="audio")
for a in audioSources: for a in audioSources:
print(a.name + ", " + a.url + ", " + str(audioTime.objects.all().filter(publisher=a)[0].minutes )) print(a.name + ", " + a.url + ", " + str(audioTime.objects.all().filter(publisher=a)[0].minutes ))
print(data)
with open(options['json'][0], 'w') as outfile:
json.dump(data, outfile)
...@@ -18,7 +18,7 @@ class Command(BaseCommand): ...@@ -18,7 +18,7 @@ class Command(BaseCommand):
def handle(self, *args, **options): def handle(self, *args, **options):
if True: if False:
#update radio stations recotding time #update radio stations recotding time
print("Recording Time:") print("Recording Time:")
recordingsDir = "/home/mario/virtualHDD/m3/recordings/" recordingsDir = "/home/mario/virtualHDD/m3/recordings/"
...@@ -50,14 +50,14 @@ class Command(BaseCommand): ...@@ -50,14 +50,14 @@ class Command(BaseCommand):
if news.count()>0: if news.count()>0:
minYear = news[0].date.year minYear = news[0].date.year
lastDate = news[0].date lastDate = news[0].date
yearList = [ int(y) for y in os.listdir('.')] yearList = sorted([ int(y) for y in os.listdir('.')])
print (yearList) print (yearList)
for y in yearList: for y in sorted(yearList):
if y >=minYear: if y >=minYear:
os.chdir(str(y)) os.chdir(str(y))
print (os.getcwd()) print (os.getcwd())
filesList = os.listdir(".") filesList = sorted(os.listdir("."))
for f in filesList: for f in filesList:
fileDate = datetime.datetime.strptime(f[:f.find(".")], "%Y-%m-%d").date() fileDate = datetime.datetime.strptime(f[:f.find(".")], "%Y-%m-%d").date()
......
...@@ -145,7 +145,7 @@ def settingsView(request): ...@@ -145,7 +145,7 @@ def settingsView(request):
else: else:
form = ProfileForm( initial={'subscriptions':[ v for v in request.user.profile.subscriptions.all().values_list('id', flat=True)]}) form = ProfileForm( initial={'subscriptions':[ v for v in request.user.profile.subscriptions.all().values_list('id', flat=True)]})
news = News.objects.all() news = News.objects.all()
publishersList = Publisher.objects.all().filter( Q(id__in = news.values('publisher').distinct())) publishersList = Publisher.objects.filter( Q(id__in = news.values('publisher').distinct()))
choice = [ (r.id,r.name) for r in publishersList ] choice = [ (r.id,r.name) for r in publishersList ]
form.fields['subscriptions'].choices=choice form.fields['subscriptions'].choices=choice
...@@ -184,7 +184,7 @@ def getNewsByRequest(request): ...@@ -184,7 +184,7 @@ def getNewsByRequest(request):
print (myQuery) print (myQuery)
return News.objects.all().filter(myQuery) return News.objects.filter(myQuery)
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
def index(request): def index(request):
...@@ -217,7 +217,7 @@ def index(request): ...@@ -217,7 +217,7 @@ def index(request):
info['nPublishers'] = news.values('publisher').distinct().count() info['nPublishers'] = news.values('publisher').distinct().count()
info['nNews'] = news.count() info['nNews'] = news.count()
info['nNewsText'] = Quantity(news.count()).render(prec=3) info['nNewsText'] = Quantity(news.count()).render(prec=3)
info['nAudio'] = Publisher.objects.all().filter(type="audio").count() info['nAudio'] = Publisher.objects.filter(type="audio").count()
queryset = news.values("publisher").order_by("publisher").annotate(count = Count('publisher') ) queryset = news.values("publisher").order_by("publisher").annotate(count = Count('publisher') )
...@@ -230,7 +230,7 @@ def index(request): ...@@ -230,7 +230,7 @@ def index(request):
donutChart = [{"label": Publisher.objects.get(id=q['publisher']).name, "value":q["count"], "url":urlDict[q['publisher']]} for q in queryset] donutChart = [{"label": Publisher.objects.get(id=q['publisher']).name, "value":q["count"], "url":urlDict[q['publisher']]} for q in queryset]
publishers = Publisher.objects.all().filter( Q(id__in = news.values('publisher').distinct())) publishers = Publisher.objects.filter( Q(id__in = news.values('publisher').distinct()))
queryset = news.values("publisher").annotate(day=TruncMonth('date') ) queryset = news.values("publisher").annotate(day=TruncMonth('date') )
...@@ -254,7 +254,7 @@ def publisherList(request, type="all"): ...@@ -254,7 +254,7 @@ def publisherList(request, type="all"):
news = getNewsByRequest(request) news = getNewsByRequest(request)
publishers = Publisher.objects.all().filter( Q(id__in = news.values('publisher').distinct()) & typeQuery) publishers = Publisher.objects.filter( Q(id__in = news.values('publisher').distinct()) & typeQuery)
for p in publishers: for p in publishers:
...@@ -295,7 +295,7 @@ def wsAudioList(request): ...@@ -295,7 +295,7 @@ def wsAudioList(request):
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
def wsSearchList(request): def wsSearchList(request):
searches = Search.objects.all().filter(user=request.user) searches = Search.objects.filter(user=request.user)
data = dict() data = dict()
data['data']=[[s.text, "*" if s is None else s.startDate.strftime('%Y-%m-%d'), "*" if s is None else s.endDate.strftime('%Y-%m-%d'), ','.join([ sub.shortName for sub in s.publishers.all()])] for s in searches] data['data']=[[s.text, "*" if s is None else s.startDate.strftime('%Y-%m-%d'), "*" if s is None else s.endDate.strftime('%Y-%m-%d'), ','.join([ sub.shortName for sub in s.publishers.all()])] for s in searches]
...@@ -405,7 +405,7 @@ def wsDownloadNews(request): ...@@ -405,7 +405,7 @@ def wsDownloadNews(request):
def audioList(request): def audioList(request):
form = SearchForm(request.GET) form = SearchForm(request.GET)
publishers = audioTime.objects.all().filter(minutes__gt=0 ) #type="audio") publishers = audioTime.objects.filter(minutes__gt=0 ) #type="audio")
return render(request,'audioList.html',{"form":form, "publishers":publishers}) return render(request,'audioList.html',{"form":form, "publishers":publishers})
#------------------------------------------------------------------------------- #-------------------------------------------------------------------------------
...@@ -413,7 +413,7 @@ def audioPublisher(request, publisher): ...@@ -413,7 +413,7 @@ def audioPublisher(request, publisher):
form = SearchForm(request.GET) form = SearchForm(request.GET)
info=dict() info=dict()
if Publisher.objects.all().filter(shortName=publisher).count() > 0: if Publisher.objects.filter(shortName=publisher).count() > 0:
p = Publisher.objects.all().filter(shortName=publisher)[0] p = Publisher.objects.filter(shortName=publisher)[0]
return render(request,'audioPlay.html',{"form":form, "publisher":p}) return render(request,'audioPlay.html',{"form":form, "publisher":p})
...@@ -26,4 +26,5 @@ urlpatterns = [ ...@@ -26,4 +26,5 @@ urlpatterns = [
url(r'^catalog/', include('catalog.urls')), url(r'^catalog/', include('catalog.urls')),
url(r'^$', RedirectView.as_view(url='/catalog/', permanent=True)), url(r'^$', RedirectView.as_view(url='/catalog/', permanent=True)),
url(r'^accounts/', include('django.contrib.auth.urls')), url(r'^accounts/', include('django.contrib.auth.urls')),
# url(r'^ws/reports/images/(?P<type>\w+)$', views.wsReportImages, name='wsReportImages')
] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT) ] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT)
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment