process.sh
#!/bin/bash
set -euo pipefail
# -e and -o pipefail will make the script exit
# in case of command failure (or piped command failure)
# -u will exit in case a variable is undefined
# (in your case, if the header is invalid)
if [ $# -eq 0 ] || [ -z "$1" ]
then
    DATASET_URLS='dataset_urls0.csv'
    echo "No arguments supplied - processing $DATASET_URLS"
else
    DATASET_URLS="$1"
fi
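# Usage examples (the non-default CSV name below is illustrative, not a file
# shipped with the repo):
#   ./process.sh                      # falls back to the default dataset_urls0.csv
#   ./process.sh dataset_urls1.csv    # processes an explicitly named URL list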
DATA_DIRECTORY='data'
SCRIPT='process_jsonl.py'
RESET='false'
POSTGRES_DBNAME=
POSTGRES_USER=
POSTGRES_PASSWORD=
POSTGRES_HOST=
POSTGRES_PORT=
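# A minimal .env sketch of what the block below expects (the values are
# placeholders, not real credentials):
#   POSTGRES_DBNAME=mydb
#   POSTGRES_USER=postgres
#   POSTGRES_PASSWORD=changeme
#   POSTGRES_HOST=localhost
#   POSTGRES_PORT=5432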
if [[ -e ".env" ]]
then
    # loading script parameters from .env
    set -a
    source .env
    set +a
    if [ -z "$POSTGRES_HOST" ] || [ -z "$POSTGRES_PORT" ] || [ -z "$POSTGRES_DBNAME" ] || [ -z "$POSTGRES_USER" ] || [ -z "$POSTGRES_PASSWORD" ]
    then
        echo "Not all PostgreSQL parameters set/loaded from .env. Exiting."
        exit 1
    fi
else
    echo "No .env file with PostgreSQL parameters found. Exiting."
    exit 1
fi
echo "PROCESSING: transforming & loading dataset files from '$DATASET_URLS' to PostgreSQL'..."
# Preparing data directory
if [ ! -d "$DATA_DIRECTORY" ]; then
    echo " No '$DATA_DIRECTORY' directory found. Exiting."
    exit 1
fi
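# Expected layout of the URL list CSV read by the loop below: a header line
# followed by one 'table,archive,url' row per dataset file (the rows here are
# hypothetical examples):
#   table,archive,url
#   users,users.jsonl.gz,https://example.com/users.jsonl.gz
#   orders,orders.jsonl.gz,https://example.com/orders.jsonl.gz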
# Ingesting data files
while IFS=, read -r table archive url || [ -n "$table" ]; do
    # echo "$table, $archive, $url"
    # checking dataset file names
    if ! [[ "$archive" =~ \.jsonl\.gz$ ]] || ! [[ "$url" =~ \.jsonl\.gz$ ]]; then
        # first line of .csv - header
        # not a valid (archive, url) pair - skipping
        # echo " Unexpected data file $archive. Skipping."
        continue
    fi
    filename="${archive%.gz}"
    if [[ "$filename" =~ \.jsonl$ ]] && [[ -e "$DATA_DIRECTORY/$filename" ]]; then
        # unpacked file found - processing
        echo " Processing $DATA_DIRECTORY/$filename ..."
        # with set -e a separate '$?' check never runs on failure, so test the command directly
        if ! python "$SCRIPT" --user "$POSTGRES_USER" --password "$POSTGRES_PASSWORD" --host "$POSTGRES_HOST" --port "$POSTGRES_PORT" --db "$POSTGRES_DBNAME" --table_name "$table" --reset "$RESET" --source "$DATA_DIRECTORY/$filename"; then
            # Aborted - stopping
            echo " Processing stopped."
            exit 1
        fi
        # processed - next
        continue
    fi
    # unpacked file not found - checking archive
    if [[ "$archive" =~ \.jsonl\.gz$ ]] && [[ -e "$DATA_DIRECTORY/$archive" ]]; then
        echo " Archive $DATA_DIRECTORY/$archive found. Testing..."
        # test archive integrity before processing
        if ! gzip -t "$DATA_DIRECTORY/$archive"; then
            echo " Testing $DATA_DIRECTORY/$archive failed. Deleting to re-download. Processing stopped."
            rm -f "$DATA_DIRECTORY/$archive"
            exit 1
        fi
        # archive found and valid - processing
        echo " Processing $DATA_DIRECTORY/$archive ..."
        if ! python "$SCRIPT" --user "$POSTGRES_USER" --password "$POSTGRES_PASSWORD" --host "$POSTGRES_HOST" --port "$POSTGRES_PORT" --db "$POSTGRES_DBNAME" --table_name "$table" --reset "$RESET" --source "$DATA_DIRECTORY/$archive"; then
            # Aborted - stopping
            echo " Processing stopped."
            exit 1
        fi
        # processed - next
        continue
    fi
done < "$DATA_DIRECTORY/$DATASET_URLS"
echo 'PROCESSING finished: OK!'