Skip to content

Commit

Permalink
Add trim/0, ltrim/0 and rtrim/0 that trims leading and trailing white…
Browse files Browse the repository at this point in the history
…space (#3056)
  • Loading branch information
wader authored Mar 20, 2024
1 parent 81f4f88 commit be437ec
Show file tree
Hide file tree
Showing 7 changed files with 144 additions and 0 deletions.
19 changes: 19 additions & 0 deletions docs/content/manual/manual.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1772,6 +1772,25 @@ sections:
input: '["fo", "foo", "barfoo", "foobar", "foob"]'
output: ['["fo","","bar","foobar","foob"]']

- title: "`trim`, `ltrim`, `rtrim`"
body: |
`trim` trims both leading and trailing whitespace.
`ltrim` trims only leading (left side) whitespace.
`rtrim` trims only trailing (right side) whitespace.
Whitespace characters are the usual `" "`, `"\n"`, `"\t"`, `"\r"`
and also all characters in the Unicode character database with the
whitespace property. Note that what is considered whitespace might
change in the future.
examples:
- program: 'trim, ltrim, rtrim'
input: '" abc "'
output: ['"abc"', '"abc "', '" abc"']

- title: "`explode`"
body: |
Expand Down
24 changes: 24 additions & 0 deletions jq.1.prebuilt

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 55 additions & 0 deletions src/builtin.c
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,58 @@ static jv f_string_indexes(jq_state *jq, jv a, jv b) {
return jv_string_indexes(a, b);
}

// Bit flags selecting which side(s) of a string string_trim() strips.
// The values are distinct bits so both sides can be requested with `|`.
enum trim_op {
  TRIM_LEFT  = 0x01,  // strip leading whitespace
  TRIM_RIGHT = 0x02   // strip trailing whitespace
};

// Strip whitespace (as decided by jvp_codepoint_is_whitespace) from the
// left and/or right side of string `a`, depending on which TRIM_LEFT /
// TRIM_RIGHT bits are set in `op`.
//
// Non-string input yields the error "trim input must be a string".
// When nothing is stripped, `a` itself is returned; otherwise a new string
// holding the remaining bytes is returned and `a` is freed.
static jv string_trim(jv a, int op) {
  if (jv_get_kind(a) != JV_KIND_STRING)
    return ret_error(a, jv_string("trim input must be a string"));

  int nbytes = jv_string_length_bytes(jv_copy(a));
  const char *first = jv_string_value(a);
  const char *last = first + nbytes;  // one past the final byte
  const char *lo = first;             // start of the kept range
  const char *hi = last;              // end (exclusive) of the kept range
  int cp;

  if (op & TRIM_LEFT) {
    // Walk forward one codepoint at a time while it is whitespace.
    // A NULL result from jvp_utf8_next ends the scan (presumably: no more
    // input to decode).
    const char *next;
    while ((next = jvp_utf8_next(lo, last, &cp)) != NULL &&
           jvp_codepoint_is_whitespace(cp)) {
      lo = next;
    }
  }

  // Only scan backwards when something is left: the string may be empty or
  // the left-side pass may already have consumed everything.
  if ((op & TRIM_RIGHT) && lo < hi) {
    do {
      // Locate the start of the last codepoint in [lo, hi) and decode it.
      const char *prev = jvp_utf8_backtrack(hi - 1, lo, NULL);
      jvp_utf8_next(prev, hi, &cp);
      if (!jvp_codepoint_is_whitespace(cp))
        break;
      hi = prev;
    } while (hi != lo);  // stop once the kept range is empty
  }

  if (lo != first || hi != last) {
    jv trimmed = jv_string_sized(lo, hi - lo);
    jv_free(a);
    return trimmed;
  }

  // Nothing was stripped, so the input can be handed back unchanged.
  return a;
}

// Builtin `trim`: strip whitespace from both sides of the input.
static jv f_string_trim(jq_state *jq, jv a) {
  const int both_sides = TRIM_LEFT | TRIM_RIGHT;
  return string_trim(a, both_sides);
}
// Builtin `ltrim`: strip whitespace from the left (leading) side only.
static jv f_string_ltrim(jq_state *jq, jv a) {
  const int left_only = TRIM_LEFT;
  return string_trim(a, left_only);
}
// Builtin `rtrim`: strip whitespace from the right (trailing) side only.
static jv f_string_rtrim(jq_state *jq, jv a) {
  const int right_only = TRIM_RIGHT;
  return string_trim(a, right_only);
}

static jv f_string_implode(jq_state *jq, jv a) {
if (jv_get_kind(a) != JV_KIND_ARRAY) {
return ret_error(a, jv_string("implode input must be an array"));
Expand Down Expand Up @@ -1721,6 +1773,9 @@ BINOPS
{f_string_explode, "explode", 1},
{f_string_implode, "implode", 1},
{f_string_indexes, "_strindices", 2},
{f_string_trim, "trim", 1},
{f_string_ltrim, "ltrim", 1},
{f_string_rtrim, "rtrim", 1},
{f_setpath, "setpath", 3}, // FIXME typechecking
{f_getpath, "getpath", 2},
{f_delpaths, "delpaths", 2},
Expand Down
18 changes: 18 additions & 0 deletions src/jv_unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,21 @@ int jvp_utf8_encode(int codepoint, char* out) {
assert(out - start == jvp_utf8_encode_length(codepoint));
return out - start;
}

// Reports whether codepoint `c` carries the White_Space property listed in
// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
// Returns 1 for a whitespace codepoint and 0 for anything else.
int jvp_codepoint_is_whitespace(int c) {
  // Contiguous ranges first.
  if (c >= 0x0009 && c <= 0x000D)  // <control-0009>..<control-000D>
    return 1;
  if (c >= 0x2000 && c <= 0x200A)  // EN QUAD..HAIR SPACE
    return 1;

  // Remaining isolated codepoints.
  switch (c) {
  case 0x0020:  // SPACE
  case 0x0085:  // <control-0085>
  case 0x00A0:  // NO-BREAK SPACE
  case 0x1680:  // OGHAM SPACE MARK
  case 0x2028:  // LINE SEPARATOR
  case 0x2029:  // PARAGRAPH SEPARATOR
  case 0x202F:  // NARROW NO-BREAK SPACE
  case 0x205F:  // MEDIUM MATHEMATICAL SPACE
  case 0x3000:  // IDEOGRAPHIC SPACE
    return 1;
  default:
    return 0;
  }
}
2 changes: 2 additions & 0 deletions src/jv_unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ int jvp_utf8_decode_length(char startchar);

int jvp_utf8_encode_length(int codepoint);
int jvp_utf8_encode(int codepoint, char* out);

// Returns nonzero when codepoint `c` has the Unicode White_Space property,
// and 0 otherwise.
int jvp_codepoint_is_whitespace(int c);
#endif
20 changes: 20 additions & 0 deletions tests/jq.test
Original file line number Diff line number Diff line change
Expand Up @@ -1334,6 +1334,26 @@ split("")
"xababababax"
[1,7,[1,3,5,7]]

# trim
# \u000b is vertical tab (\v not supported by json)
map(trim), map(ltrim), map(rtrim)
[" \n\t\r\f\u000b", ""," ", "a", " a ", "abc", " abc ", " abc", "abc "]
["", "", "", "a", "a", "abc", "abc", "abc", "abc"]
["", "", "", "a", "a ", "abc", "abc ", "abc", "abc "]
["", "", "", "a", " a", "abc", " abc", " abc", "abc"]

trim, ltrim, rtrim
"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
"abc"
"abc\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000"
"\u0009\u000A\u000B\u000C\u000D\u0020\u0085\u00A0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u2028\u2029\u202F\u205F\u3000abc"

try trim catch ., try ltrim catch ., try rtrim catch .
123
"trim input must be a string"
"trim input must be a string"
"trim input must be a string"

indices(1)
[0,1,1,2,3,4,1,5]
[1,2,6]
Expand Down
6 changes: 6 additions & 0 deletions tests/man.test

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit be437ec

Please sign in to comment.