diff --git a/src/trackers/helpers/getCategory.js b/src/trackers/helpers/getCategory.js
index c871a3e..107c35f 100644
--- a/src/trackers/helpers/getCategory.js
+++ b/src/trackers/helpers/getCategory.js
@@ -1,32 +1,40 @@
 const fs = require('fs')
-const _ = require('underscore')
 const parse = require('csv-parse/lib/sync')
 
+/**
+ * Read a category CSV and build a lookup of domain -> category flags.
+ *
+ * The file needs a header row containing a `domain` column; every other
+ * column is treated as a category. Cell values become flags: '1' -> 1,
+ * '0' or '' -> 0, anything else -> null (and a message is logged).
+ *
+ * @param {string} categoryCSVfilePath path of the CSV file to read
+ * @returns {Object} map of domain to an object of category name -> 1, 0 or null
+ */
 function getCategories (categoryCSVfilePath) {
-    const categoryCSV = fs.readFileSync(categoryCSVfilePath, 'utf8').split('\n')
-    const categoryHeader = categoryCSV.shift()
-        .replace(/\r/gi, "")
-        .split(',').slice(1)
+    const records = parse(fs.readFileSync(categoryCSVfilePath, 'utf8'), {
+        columns: true,
+        delimiter: ',',
+        // the previous line-by-line parser ignored blank lines; keep doing so
+        skip_empty_lines: true,
+    })
 
-    const domainToCategory = categoryCSV.reduce((obj, row) => {
-
-        row = parse(row)[0]
-        if (!row) {return obj}
-
-        const domain = row[0]
-
-        // clean up category values. 1 means this is in the category, anything else no idea so skip it
-        const rowArray = Array.from(row.slice(1)).map(c => {
+    const domainToCategory = records.reduce((obj, row) => {
+        const domain = row.domain
+        obj[domain] = row
+        delete row.domain
+        Object.keys(row).forEach(category => {
+            const c = row[category]
+            // '1' puts the domain in the category, '0' or an empty cell leaves it out
             if (c === '1') {
-                return 1
+                row[category] = 1
             } else if (c === '0' || c === '') {
-                return 0
+                row[category] = 0
+            } else {
+                console.log(`unknown category value for ${domain}, ${c}`)
+                row[category] = null
             }
-            console.log(`unknown category value for ${domain}, ${c}`)
-            return null
         })
-
-        obj[domain] = _.object(categoryHeader, rowArray)
         return obj
     }, {})
     return domainToCategory
diff --git a/test/categories.test.js b/test/categories.test.js
new file mode 100644
index 0000000..02797a0
--- /dev/null
+++ b/test/categories.test.js
@@ -0,0 +1,21 @@
+const assert = require('assert')
+const {describe, it} = require('mocha')
+const { getCategories } = require('../src/trackers/helpers/getCategory')
+
+describe('getCategories', () => {
+    it('generates a domain/category map', () => {
+        const domainToCategory = getCategories('test/fixtures/categorized_trackers.csv')
+        assert.deepStrictEqual(domainToCategory, {
+            'ads.com': {
+                'First Category': 1,
+                'Advertising': 1,
+            },
+            'example.com': {
+                'First Category': 0,
+                'Advertising': 1,
+            }
+        })
+    })
+})
+
+
diff --git a/test/fixtures/categorized_trackers.csv b/test/fixtures/categorized_trackers.csv
new file mode 100644
index 0000000..ca04b1e
--- /dev/null
+++ b/test/fixtures/categorized_trackers.csv
@@ -0,0 +1,3 @@
+domain,"First Category",Advertising
+example.com,,1
+ads.com,1,1
\ No newline at end of file