-
Notifications
You must be signed in to change notification settings - Fork 0
/
DBFeeder.py
144 lines (110 loc) · 4.95 KB
/
DBFeeder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import getopt
import sys

from DatasetReader import DatasetReader
from dbConnector.M20PSQLConnector import *
def printUsage():
    """Print the command-line help for this script to standard output.

    Every line is emitted with a single-string ``print(...)`` call, which
    produces the same output under Python 2 and Python 3.
    """
    print("Usage --help | --init | --initclear | --dropall | [--fraction=FRACTION] [--datasets=PATH] --psql_user=USER --psql_password=PASSWORD")
    print("--help show this message")
    print("--init to init tables and populate with first level tables")
    # --initclear creates the tables but skips the population step, so it
    # must not share the description of --init.
    print("--initclear to init tables without populating the first level tables")
    print("--dropall to drop all the tables and reset the system")
    print("")
    print("--datasets=PATH path of csv dataset files. Default = datasets/data/")
    print("--fraction=FRACTION percentage of the dataset to read. Default = 0.05")
    print("--psql_user=USER postgres DB user")
    print("--psql_password=PASSWORD postgres DB password")
# Long options understood by the script; a trailing '=' means "takes a value".
_LONG_OPTIONS = ['help',
                 'init',
                 'initclear',
                 'dropall',
                 'fraction=',
                 'datasets=',
                 'psql_user=',
                 'psql_password=']

# Placeholder shown instead of the database password in diagnostic output.
_MASK = '********'


def _redact_options(opts):
    """Return a copy of getopt's (option, value) pairs with the password masked."""
    return [(opt, _MASK if opt == '--psql_password' and arg else arg)
            for opt, arg in opts]


def _parse_options(argv):
    """Parse the command-line arguments into a settings dictionary.

    Args:
        argv: argument list without the program name (``sys.argv[1:]``).

    Returns:
        A ``(settings, opts)`` tuple. ``settings`` maps 'help', 'init_db',
        'init_clear', 'drop_all', 'fraction', 'dataset_path', 'psql_user'
        and 'psql_pass' to their parsed (or default) values; ``opts`` is the
        raw list of (option, value) pairs returned by ``getopt.getopt``.

    Raises:
        getopt.GetoptError: on an unknown option or a missing option value.
        ValueError: when the value of --fraction is not a number.
    """
    # Positional leftovers are ignored.
    opts, _ = getopt.getopt(argv, "", _LONG_OPTIONS)
    settings = {
        'help': False,
        'init_db': False,
        'init_clear': False,
        'drop_all': False,
        'fraction': 0.05,
        'dataset_path': 'datasets/data/',
        'psql_user': "",
        'psql_pass': "",
    }
    for opt, arg in opts:
        if opt == '--help':
            # Help wins immediately: options after it are not evaluated.
            settings['help'] = True
            break
        elif opt == '--init':
            settings['init_db'] = True
        elif opt == '--initclear':
            settings['init_db'] = True
            settings['init_clear'] = True
        elif opt == '--dropall':
            settings['drop_all'] = True
        elif opt == '--fraction':
            settings['fraction'] = float(arg)
        elif opt == '--datasets':
            settings['dataset_path'] = arg
        elif opt == '--psql_user':
            settings['psql_user'] = arg
        elif opt == '--psql_password':
            settings['psql_pass'] = arg
    return settings, opts


def _init_database(connector, dataset_path, populate):
    """Create the tables and, if requested, insert the first level tables.

    The first level tables are movies, genome tags and links.

    Args:
        connector: connected M20PSQLConnector.
        dataset_path: directory that holds the csv dataset files.
        populate: when False (--initclear) only the tables and the dataset
            readers are initialised; no rows are inserted.
    """
    connector.initDB()

    # NOTE: the paths are built by plain concatenation on purpose. The reader
    # is initialised here and reopened by the load step with the very same
    # path string -- presumably it keeps per-file state; verify against
    # DatasetReader before normalising these paths.
    # Init to read
    moviesDS = DatasetReader.initWithFraction(dataset_path + '/movies.csv', 1.0, ',', init=True)
    gtagsDS = DatasetReader.initWithFraction(dataset_path + '/genome-tags.csv', 1.0, ',', init=True)
    linksDS = DatasetReader.initWithFraction(dataset_path + '/links.csv', 1.0, ',', init=True)

    # Just init: these readers are constructed only for whatever setup
    # ``init=True`` performs; their rows are read by later (non-init) runs.
    DatasetReader(dataset_path + "/ratings.csv", init=True)
    DatasetReader(dataset_path + "/tags.csv", init=True)
    DatasetReader(dataset_path + "/genome-scores.csv", init=True)

    if not populate:
        return

    for movie in moviesDS.readPercentage():
        connector.insert(M20Movie(movie['movieId'], movie['title'], movie['genres']))

    for tag in gtagsDS.readPercentage():
        connector.insert(M20GenomeTag(tag['tagId'], tag['tag']))

    for link in linksDS.readPercentage():
        connector.insert(M20Link(link['movieId'], link['imdbId'], link['tmdbId']))


def _load_fraction(connector, dataset_path, fraction):
    """Insert a fraction of the ratings and tags datasets into the database.

    Args:
        connector: connected M20PSQLConnector.
        dataset_path: directory that holds the csv dataset files.
        fraction: value of --fraction, passed through to
            ``DatasetReader.initWithFraction``.
    """
    print("Load " + str(fraction * 100) + "% of each Dataset into the Database")

    ratingsDS = DatasetReader.initWithFraction(dataset_path + "/ratings.csv", fraction, ',')
    print("ratings loaded")
    for rat in ratingsDS.readPercentage():
        connector.insert(M20Rating(rat['userId'], rat['movieId'], rat['rating'], rat['timestamp']))

    tagsDS = DatasetReader.initWithFraction(dataset_path + "/tags.csv", fraction, ',')
    print("tags loaded")
    for tag in tagsDS.readPercentage():
        connector.insert(M20Tag(tag['userId'], tag['movieId'], tag['tag'], tag['timestamp']))

    # NOTE: genome-scores.csv (M20GenomeScore rows) is not loaded by this step.


def main(argv):
    """Run the feeder: drop, initialise or load the database.

    Args:
        argv: argument list without the program name (``sys.argv[1:]``).

    Exits with status 0 after printing the help for --help, and with
    status 2 when the arguments cannot be parsed.
    """
    try:
        settings, opts = _parse_options(argv)
    except (getopt.GetoptError, ValueError) as err:
        print("Invalid arguments: %s" % err)
        printUsage()
        sys.exit(2)

    # The raw argument list is deliberately not echoed: it contains the
    # database password. Only the redacted option list is printed.
    print("OPTIONS:  %s" % (_redact_options(opts),))

    if settings['help']:
        printUsage()
        sys.exit(0)

    print("init DB  %s" % settings['init_db'])
    print("init clear  %s" % settings['init_clear'])
    print("dropall  %s" % settings['drop_all'])
    print("fraction  %s" % settings['fraction'])
    print("dataset path  %s" % settings['dataset_path'])
    print("psql user  %s" % settings['psql_user'])
    print("psql pass  %s" % (_MASK if settings['psql_pass'] else ""))

    # Credentials come from the command line only; never hard-code them here.
    m20Connector = M20PSQLConnector('postgres', settings['psql_user'], 'localhost', '7432',
                                    settings['psql_pass'])
    m20Connector.connect()
    try:
        if settings['drop_all']:
            # --dropall takes precedence over every other action.
            m20Connector.dropAll()
        elif settings['init_db']:
            _init_database(m20Connector, settings['dataset_path'],
                           populate=not settings['init_clear'])
        else:
            _load_fraction(m20Connector, settings['dataset_path'], settings['fraction'])
    finally:
        # Close the connection even when an insert or a reader raises.
        m20Connector.close()


if __name__ == '__main__':
    main(sys.argv[1:])