############
############
############ bytefreq DQ TEST Script, also useful for USER TRAINING
############ author: Andrew Morgan
############ license GPLv3
############
############
if [ -n "$1" ]; then
    runtime=$1
else
    runtime="gawk"
fi
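# Optional sanity check (an added safeguard, not part of the original flow):
# make sure the chosen awk runtime is actually installed before we go further.
if ! command -v "${runtime}" >/dev/null 2>&1; then
    echo "ERROR: runtime '${runtime}' not found on PATH" >&2
    exit 1
fi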
echo "ByteFreq: Mask Based Data Profiler"
echo ""
echo "#### "${runtime}
# This script generates each of the report types against 100k rows of Companies House data from the UK, which has known quality issues.
# It also walks through the process of examining data quality on a new file.
## Prepare and reset the test suite
echo "## Reset test files and directories"
rm -f testdata/*.csv
rm -f testdata/*.pip
rm -f out/*
## Download the Companies House data, which is provided in csv + enclosures format, as Excel produces. (CSV is ALWAYS a terrible mistake - please avoid it.)
echo "checking if you have the test data from companies house, if not we'll download it"
# if the file doesn't exist, go get it
if [ ! -f "testdata/BasicCompanyData-part6.zip" ]
then
echo "## Fetching the companies house data using wget"
wget http://download.companieshouse.gov.uk/BasicCompanyData-part6.zip
mv BasicCompanyData-part6.zip testdata/.
fi
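# Optional check (an added safeguard, not in the original flow): confirm the
# download actually landed before we try to unzip it.
if [ ! -f "testdata/BasicCompanyData-part6.zip" ]; then
    echo "ERROR: testdata/BasicCompanyData-part6.zip is missing - did the download fail?" >&2
    exit 1
fi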
## Unzip the data.
echo "## unzipping the data"
cd testdata
unzip BasicCompanyData-part6.zip
cd ..
for yourfiles in testdata/*.csv; do
## Use charfreq to find out what is in the file. Is it all extended ASCII? Maybe not!
echo "## use charfreq to study the whole raw file before we parse it. Could take a minute or two"
echo "## what can we learn?"
od -cb ${yourfiles} | ${runtime} -f charfreq.awk | sort -n > charfreq.rpt.txt
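# If charfreq.awk is not to hand, a rough stand-in (an illustrative sketch only,
# not the project's charfreq.awk) can count raw byte frequencies like this:
# od -An -tx1 ${yourfiles} | ${runtime} '{for(i=1;i<=NF;i++) c[$i]++} END{for(b in c) printf "0x%s %d\n", toupper(b), c[b]}' | sort -k2,2nr > charfreq.rpt.txt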
echo ""
echo "==============================================================================="
echo "here is the final character frequency analysis report"
echo "==============================================================================="
cat charfreq.rpt.txt
echo ""
echo ""
echo "======================================================"
echo "Let's examine key areas of it to make conclusions: "
echo "======================================================"
echo ""
echo "Is this file LF, or CR, LF/CR delimited? anything odd?"
cat charfreq.rpt.txt | grep "0x0A\|0x0D"
echo ""
echo "======================================================"
echo "Check below matching numbers for opening/closing chars"
cat charfreq.rpt.txt | grep -i "left\|right"
echo ""
echo "======================================================"
echo "Check below for potential enclosure issues, preparser "
cat charfreq.rpt.txt | grep -i quo
echo ""
echo "======================================================"
echo "check below for things having pairs... matched? "
cat charfreq.rpt.txt | grep -i "left\|right"
## Use charfreq to suggest a good alternative delimiter. I suggest Pipe "|" delimited data is easy, clean, best. Does charfreq prove this?
echo ""
echo "======================================================"
echo "## Any good delimiter choices not in the raw data? "
cat charfreq.rpt.txt | grep -i "vertical bar\|Record Separator\|Inverted exclamation mark\|Horizontal Tab"
echo ""
echo "===end of review==============================================================="
## If wanted, you can downsample the file - a random ~100k records will be fine. Taking "1 in N" records, capped at 100k, is the easiest effective approach (see the commented examples below).
##
# ${runtime} 'NR%5==0' ${yourfiles} | head -100000 > downsample.csv
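# A slightly fuller sketch (optional, illustrative only): keep the header row and
# cap the sample at roughly 100k records; adjust the modulus to suit your file size.
# ${runtime} 'NR==1 || NR%5==0' ${yourfiles} | head -100001 > downsample.csv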
echo ""
echo "Now start to parse the raw csv file using python into pipe delimited"
echo "(there are some helper python scripts in the parser directory you can edit to make your own)"
echo ""
# Because we DO NOT TRUST the csv parsing - we need to profile the data AFTER it has been parsed.
# So we convert the raw data into PARSED data to test against - and since we want to read the data later using python, we convert it using python.
python3 parsers/csv2pipe.py ${yourfiles}
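# For reference, a gawk-only alternative to the python parser (an illustrative
# sketch, NOT the project's csv2pipe.py): FPAT treats a quoted field as a single
# field, so embedded commas survive. Note FPAT requires gawk; mawk will not do.
# gawk -v OFS="|" 'BEGIN{FPAT="([^,]*)|(\"[^\"]*\")"} {$1=$1; gsub(/"/,""); print}' ${yourfiles} > ${yourfiles}.fpat.pip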
# Finally we do the data profiling - to test the quality of the data, as processed by our python PARSER.
# Do it by reading in the file and generating the profiling data.
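# Before reading the reports it helps to see what a "mask" is. The one-liner
# below is an illustrative sketch of the idea (not bytefreq's exact rules):
# uppercase maps to A, lowercase to a, digits to 9, punctuation is kept,
# so a value like "SW1A 2AA" profiles to the pattern "AA9A 9AA".
echo "Example mask for the value 'SW1A 2AA':"
echo "SW1A 2AA" | ${runtime} '{gsub(/[A-Z]/,"A"); gsub(/[a-z]/,"a"); gsub(/[0-9]/,"9"); print}'
echo ""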
echo "GENERATE HUMAN READABLE REPORT - popular eyeball inspection report"
time ${runtime} -F"|" -f bytefreq_v1.05.awk -v header="1" -v report="1" -v grain="L" ${yourfiles}.pip >out/UkCompanySample.rpt1.txt
echo "GENERATE DATABASE LOADABLE REPORT SUMMARY OUTPUTS - for automation, used for drift in quality analysis"
time ${runtime} -F"|" -f bytefreq_v1.05.awk -v header="1" -v report="0" -v grain="L" ${yourfiles}.pip > out/UkCompanySample.rpt0.txt
echo "#GENERATE DATABASE LOADABLE RAW+PROFILED DATA - for manual cleansing, find bad datapoints and fix them"
time ${runtime} -F"|" -f bytefreq_v1.05.awk -v header="1" -v report="2" -v grain="L" ${yourfiles}.pip > out/UkCompanySample.raw2.txt
echo "GENERATE DATABASE LOADABLE LONGFORMAT RAW DATA - for automated remediation"
time ${runtime} -F"|" -f bytefreq_v1.05.awk -v header="1" -v report="3" -v grain="L" ${yourfiles}.pip > out/UkCompanySample.raw3.txt
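# Quick optional sanity check on the outputs generated above: confirm each
# report file exists and is non-empty before you start digging into them.
for rpt in out/UkCompanySample.rpt1.txt out/UkCompanySample.rpt0.txt out/UkCompanySample.raw2.txt out/UkCompanySample.raw3.txt; do
    [ -s "${rpt}" ] && echo "OK: ${rpt}" || echo "WARNING: ${rpt} is missing or empty"
done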
echo "SUCCESS !"
# Now you have everything you need to eyeball the quality, study drift over time, and find and propose fixes to automate correcting bad data points.
# All that is left to do is to use your new understanding to build automated data quality tools that sit inline in your data pipelines.
# enjoy
# Andrew
done