-
Notifications
You must be signed in to change notification settings - Fork 0
/
schema.sql
288 lines (272 loc) · 17.2 KB
/
schema.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
CREATE TABLE assembly
--- Results from querying NCBI Entrez Assembly database
(
-- Column list order must match order stated in fetch.sh line #39
"uid" TEXT PRIMARY KEY, -- Record Id
"rsuid" TEXT, -- RefSeq Id
"gbuid" TEXT, -- GenBank Id
"assemblyaccession" TEXT,
"lastmajorreleaseaccession" TEXT,
"latestaccession" TEXT,
"chainid" TEXT,
"assemblyname" TEXT,
"ucscname" TEXT,
"ensemblname" TEXT,
"taxid" TEXT,
"organism" TEXT,
"speciestaxid" TEXT,
"speciesname" TEXT,
"assemblytype" TEXT,
"assemblyclass" TEXT,
"assemblystatus" TEXT,
"assemblystatussort" INTEGER,
"wgs" TEXT,
"gb_bioprojects" TEXT, -- JSON encoded list of genbank bioproject objects
"gb_projects" TEXT, -- JSON encoded list of genbank projects
"rs_bioprojects" TEXT, -- JSON encoded list of RefSeq bioproject objects
"rs_projects" TEXT, -- JSON encoded list of RefSeq projects
"biosampleaccn" TEXT,
"biosampleid" TEXT,
-- "biosource" object, -- This structure flattened into biosource_* columns
"biosource_infraspecieslist" TEXT, -- JSON encoded list
"biosource_sex" TEXT,
"biosource_isolate" TEXT,
"coverage" TEXT,
"partialgenomerepresentation" TEXT,
"primary" TEXT,
"assemblydescription" TEXT,
"releaselevel" TEXT,
"releasetype" TEXT,
"asmreleasedate_genbank" TEXT,
"asmreleasedate_refseq" TEXT,
"seqreleasedate" TEXT,
"asmupdatedate" TEXT,
"submissiondate" TEXT,
"lastupdatedate" TEXT,
"submitterorganization" TEXT,
"refseq_category" TEXT,
"anomalouslist" TEXT, -- JSON encoded list
"exclfromrefseq" TEXT, -- JSON encoded list
"propertylist" TEXT, -- COMMA separated list
"fromtype" TEXT,
-- "synonym" object, -- This structure flattened into synonym_* columns
"synonym_genbank" TEXT,
"synonym_refseq" TEXT,
"synonym_similarity" TEXT,
"contign50" INTEGER,
"scaffoldn50" INTEGER,
"ftppath_genbank" TEXT,
"ftppath_refseq" TEXT,
"ftppath_assembly_rpt" TEXT,
"ftppath_stats_rpt" TEXT,
"ftppath_regions_rpt" TEXT,
"sortorder" TEXT
-- "meta" TEXT -- Omitted
) WITHOUT ROWID;
CREATE TABLE summaries
--- Statistics generated from the genomic datasets. One record per replicon.
(
-- Column list order must match order stated in fetch.sh line #170
"id" INTEGER PRIMARY KEY, -- Alias of rowid
"uid" TEXT REFERENCES assembly ("uid") ON DELETE CASCADE NOT NULL, -- Foreign key to assembly table
"seqid" TEXT NOT NULL, -- Replicon Id
"accession" TEXT, -- Replicon accession
"name" TEXT, -- Replicon name
"description" TEXT, -- Replicon description
"type" TEXT, -- Replicon type (chromosome/plasmid)
"molecule_type" TEXT, -- Molecule type [DNA/RNA]
"sequence_version" INTEGER, -- Version of replicon sequence
"gi_number" INTEGER, --
"cds_count" INTEGER, -- # of CDS features
"gene_count" INTEGER, -- # of gene features
"rna_count" INTEGER, -- # of xRNA features
"repeat_region_count" INTEGER, -- # of repeat_region features
"length" INTEGER, -- Length of replicon sequence
"source" TEXT, -- Replicon source
UNIQUE (uid, seqid)
);
CREATE TABLE datasets
--- Listing of all files
(
"uid" TEXT REFERENCES assembly ("uid") ON DELETE CASCADE NOT NULL, -- Foreign key to assembly table
"replicon" INTEGER REFERENCES summaries ("id") ON DELETE CASCADE, -- Foreign key to summary table, NULL when dataset represents all replicons
"path" TEXT NOT NULL UNIQUE, -- Path to dataset relative to the database
"format" TEXT NOT NULL, -- Format of dataset (file extension)
"suffix" TEXT NOT NULL, -- Filename suffix excluding extension, describes file content
"checksum" TEXT, -- MD5 Checksum of dataset (or compressed dataset) as reported by NCBI
"parent" TEXT REFERENCES datasets ("path") ON DELETE CASCADE -- Reference to derivative dataset
);
CREATE TABLE taxonomy_names
--- Names of tax_ids in taxonomy_nodes table. Multiple records exist per tax_id.
(
tax_id INTEGER, -- the id of node associated with this name
name_txt TEXT, -- name itself
unique_name TEXT, -- the unique variant of this name if name not unique
name_class TEXT -- (synonym, common name, ...)
);
CREATE INDEX taxonomy_names_tax_id ON taxonomy_names (tax_id);
CREATE TABLE taxonomy_divisions
--- Taxonomy divisions referenced by taxonomy_nodes table
(
id INTEGER PRIMARY KEY, -- taxonomy database division id
cde TEXT, -- GenBank division code (three characters)
name TEXT, -- e.g. BCT, PLN, VRT, MAM, PRI...
comments TEXT -- free-text comments
);
CREATE TABLE taxonomy_gencode
--- Genbank genetic codes referenced by taxonomy_nodes table
(
id INTEGER PRIMARY KEY, -- GenBank genetic code id
abbreviation TEXT, -- genetic code name abbreviation
name TEXT, -- genetic code name
cde TEXT, -- translation table for this genetic code
starts TEXT -- start codons for this genetic code
);
CREATE TABLE taxonomy_nodes
--- Taxonomic tree of Genbank taxonomy database
(
tax_id INTEGER PRIMARY KEY, -- node id in GenBank taxonomy database
parent_tax_id INTEGER REFERNCES taxonomy_nodes, -- parent node id in GenBank taxonomy database
rank TEXT, -- rank of this node (superkingdom, kingdom, ...)
embl_code TEXT, -- locus-name prefix; not unique
division_id INTEGER REFERENCES taxonomy_divisions (id) ON DELETE CASCADE, -- see taxonomy_divisions table
inherited_div_flag BOOLEAN, -- 1 if node inherits division from parent
genetic_code_id INTEGER REFERENCES taxonomy_gencode (id) ON DELETE CASCADE, -- see taxonomy_gencode table
inherited_GC_flag BOOLEAN, -- 1 if node inherits genetic code from parent
mitochondrial_genetic_code_id INTEGER REFERENCES taxonomy_gencode (id) ON DELETE CASCADE, -- see taxonomy_gencode table
inherited_MGC_flag BOOLEAN, -- 1 if node inherits mitochondrial gencode from parent
GenBank_hidden_flag BOOLEAN, -- 1 if name is suppressed in GenBank entry lineage
hidden_subtree_root_flag BOOLEAN, -- 1 if this subtree has no sequence data yet
comments TEXT -- free-text comments and citations
);
CREATE TABLE taxonomy_deleted
--- Taxonomy ids that have been previously deleted
(
tax_id INTEGER PRIMARY KEY -- deleted node id
);
CREATE TABLE taxonomy_merged
--- Taxonomy nodes that have been previously merged
(
old_tax_id INTEGER PRIMARY KEY, -- id of nodes which has been merged
new_tax_id INTEGER REFERENCES taxonomy_nodes (tax_id) -- id of nodes which is result of merging
);
CREATE TABLE taxonomy_citations
--- Citation information for taxonomy data
(
cit_id INTEGER PRIMARY KEY, -- the unique id of citation
cit_key TEXT, -- citation key
pubmed_id INTEGER, -- unique id in PubMed database (0 if not in PubMed)
medline_id INTEGER, -- unique id in MedLine database (0 if not in MedLine)
url TEXT, -- URL associated with citation
text TEXT, -- any text (usually article name and authors). The following characters are escaped in this text by a backslash: newline (appear as "\n"), tab character ("\t"), double quotes ('\"'), backslash character ("\\").
taxid_list TEXT -- list of node ids separated by a single space
);
CREATE VIEW metadata
--- Genome metadata. gram_stain data is approximated based on taxonomic relation
AS
SELECT s.uid,
s.chromosomes,
s.plasmids,
(CASE
WHEN t.phylum IN ('Actinobacteria', 'Chloroflexi', 'Firmicutes') THEN '+'
WHEN t.genus == 'Chlorobi' OR t.phylum IN
('Acidobacteria', 'Aquificae', 'Bacteroidetes', 'Deinococcus-Thermus', 'Chlamydiae', 'Cyanobacteria',
'Elusimicrobia', 'Fusobacteria', 'Nitrospirae', 'Planctomycetes', 'Proteobacteria', 'Spirochaetes',
'Tenericutes', 'Thermotogae', 'Verrucomicrobia', 'Dictyoglomi') THEN '-' END) as gram_stain
FROM (
SELECT uid,
COUNT(CASE type WHEN 'chromosome' THEN 1 END) as chromosomes,
COUNT(CASE type WHEN 'plasmid' THEN 1 END) as plasmids
FROM summaries
GROUP BY uid
) as s
JOIN assembly a ON s.uid == s.uid
JOIN taxonomy t ON t.taxon_id == a.taxid;
CREATE VIEW genomeproject
--- Depreciated. This view is included for backwards compatibility only.
AS
SELECT assembly.uid AS gpv_id,
assemblyaccession AS assembly_accession,
assemblyname AS asm_name,
replace(speciesname, ' ', '_') AS genome_name,
json_extract(rs_bioprojects, '$[0].bioprojectaccn') AS bioproject,
biosampleaccn AS biosample,
taxid AS taxid,
speciestaxid AS species_taxid,
speciesname AS org_name,
json_extract(biosource_infraspecieslist, '$[0].sub_type') | '=' |
json_extract(biosource_infraspecieslist, '$[0].sub_value') AS infraspecific_name,
submitterorganization AS submitter,
seqreleasedate AS release_date,
rtrim(datasets.path, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.[]()-_') AS gpv_directory,
replace(datasets.path, rtrim(datasets.path, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.[]()-_'), '') AS filename,
' .' | group_concat(datasets.format, ' .') AS file_types,
NULL AS prev_gpv
FROM assembly,
datasets
WHERE assembly.uid = datasets.uid
GROUP BY datasets.uid;
CREATE VIEW replicon
--- Depreciated. This view is included for backwards compatibility only.
AS
SELECT id AS rpv_id,
summaries.uid AS gpv_id,
accession AS rep_accnum,
sequence_version AS rep_version,
description AS definition,
type AS rep_type,
gi_number AS rep_ginum,
rtrim(rtrim(replace(datasets.path, rtrim(datasets.path, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.[]()-_'), ''),
'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789[]()-_'), '.') AS file_name,
' .' | group_concat(datasets.format, ' .') AS file_types,
cds_count AS cds_num,
gene_count AS gene_num,
length AS rep_size,
rna_count AS rna_num
FROM summaries,
datasets
WHERE summaries.id = datasets.replicon
GROUP BY datasets.uid;
CREATE VIEW genomeproject_meta
--- Depreciated. This view is included for backwards compatibility only.
AS
SELECT metadata.uid AS gpv_id,
gram_stain AS gram_stain,
NULL AS genome_gc,
NULL AS patho_status,
NULL AS disease,
NULL AS genome_size,
NULL AS pathogenic_in,
NULL AS temp_range,
NULL AS habitat,
NULL AS shape,
NULL AS arrangement,
NULL AS endospore,
NULL AS motility,
NULL AS salinity,
NULL AS oxygen_req,
chromosomes AS chromosome_num,
plasmids AS plasmid_num,
0 AS contig_num
FROM metadata;
CREATE VIEW genomeproject_checksum
--- Depreciated. This view is included for backwards compatibility only.
AS
SELECT replace(datasets.path, rtrim(datasets.path, 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.[]()-_'), '') AS filename,
checksum AS checksum,
uid AS gpv_id
FROM datasets;
CREATE TABLE taxonomy
--- Depreciated. Flattened taxonomy information for each assembly entry. This table is included for backwards compatibility only.
(
taxon_id TEXT PRIMARY KEY REFERENCES assembly ("taxid") NOT NULL,
superkingdom TEXT,
phylum TEXT,
tax_class TEXT,
"order" TEXT,
family TEXT,
genus TEXT,
species TEXT,
other TEXT,
synonyms TEXT
);