Skip to content

Commit

Permalink
Fix category parsing of quoted headers (#51)
Browse files Browse the repository at this point in the history
* Add a test for category parsing

* Update category parser to handle quoted headers.
  • Loading branch information
sammacbeth authored Jan 29, 2024
1 parent 4d842a2 commit d8f044f
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 20 deletions.
36 changes: 16 additions & 20 deletions src/trackers/helpers/getCategory.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,28 @@
const fs = require('fs')
const _ = require('underscore')
const parse = require('csv-parse/lib/sync')

function getCategories (categoryCSVfilePath) {
const categoryCSV = fs.readFileSync(categoryCSVfilePath, 'utf8').split('\n')
const categoryHeader = categoryCSV.shift()
.replace(/\r/gi, "")
.split(',').slice(1)
const records = parse(fs.readFileSync(categoryCSVfilePath, 'utf8'), {
columns: true,
delimiter: ',',
})
console.log(records)

const domainToCategory = categoryCSV.reduce((obj, row) => {

row = parse(row)[0]
if (!row) {return obj}

const domain = row[0]

// clean up category values. 1 means this is in the category, anything else no idea so skip it
const rowArray = Array.from(row.slice(1)).map(c => {
const domainToCategory = records.reduce((obj, row) => {
const domain = row.domain
obj[domain] = row
delete row.domain
Object.keys(row).forEach(category => {
const c = row[category]
if (c === '1') {
return 1
row[category] = 1
} else if (c === '0' || c === '') {
return 0
row[category] = 0
} else {
console.log(`unknown category value for ${domain}, ${c}`)
row[category] = null
}
console.log(`unknown category value for ${domain}, ${c}`)
return null
})

obj[domain] = _.object(categoryHeader, rowArray)
return obj
}, {})
return domainToCategory
Expand Down
21 changes: 21 additions & 0 deletions test/categories.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
const assert = require('assert')
const {describe, it, before} = require('mocha')
const { getCategories } = require('../src/trackers/helpers/getCategory')

// Mocha suite for getCategories: parses the CSV fixture and checks the
// resulting domain -> { category: flag } map against a hand-written value.
describe('getCategories', () => {
  it('generates a domain/category map', () => {
    // One entry per domain row in the fixture; each category column becomes
    // a numeric flag keyed by its header text.
    const expected = {
      'ads.com': {
        'First Category': 1,
        'Advertising': 1,
      },
      'example.com': {
        'First Category': 0,
        'Advertising': 1,
      },
    }

    const domainToCategory = getCategories('test/fixtures/categorized_trackers.csv')

    // deepStrictEqual compares nested keys and values with strict equality,
    // so the flags must be numbers rather than the raw CSV strings.
    assert.deepStrictEqual(domainToCategory, expected)
  })
})


3 changes: 3 additions & 0 deletions test/fixtures/categorized_trackers.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
domain,"First Category",Advertising
example.com,,1
ads.com,1,1

0 comments on commit d8f044f

Please sign in to comment.