helix-mods/helix-core/src/shellwords.rs
Mike Trinkala 62d046fa21
Fix utf8 length handling for shellwords (#5738)
If the last argument to shellwords ends in a multibyte utf8 character
the entire argument will be dropped.
e.g. `:sh echo test1 test2𒀀` will only output `test1`

Add additional tests based on the code review feedback
2023-02-01 16:07:42 -06:00

350 lines
11 KiB
Rust

use std::borrow::Cow;
/// Auto escape for shellwords usage.
pub fn escape(input: Cow<str>) -> Cow<str> {
if !input.chars().any(|x| x.is_ascii_whitespace()) {
input
} else if cfg!(unix) {
Cow::Owned(input.chars().fold(String::new(), |mut buf, c| {
if c.is_ascii_whitespace() {
buf.push('\\');
}
buf.push(c);
buf
}))
} else {
Cow::Owned(format!("\"{}\"", input))
}
}
enum State {
OnWhitespace,
Unquoted,
UnquotedEscaped,
Quoted,
QuoteEscaped,
Dquoted,
DquoteEscaped,
}
pub struct Shellwords<'a> {
state: State,
/// Shellwords where whitespace and escapes has been resolved.
words: Vec<Cow<'a, str>>,
/// The parts of the input that are divided into shellwords. This can be
/// used to retrieve the original text for a given word by looking up the
/// same index in the Vec as the word in `words`.
parts: Vec<&'a str>,
}
impl<'a> From<&'a str> for Shellwords<'a> {
fn from(input: &'a str) -> Self {
use State::*;
let mut state = Unquoted;
let mut words = Vec::new();
let mut parts = Vec::new();
let mut escaped = String::with_capacity(input.len());
let mut part_start = 0;
let mut unescaped_start = 0;
let mut end = 0;
for (i, c) in input.char_indices() {
state = match state {
OnWhitespace => match c {
'"' => {
end = i;
Dquoted
}
'\'' => {
end = i;
Quoted
}
'\\' => {
if cfg!(unix) {
escaped.push_str(&input[unescaped_start..i]);
unescaped_start = i + 1;
UnquotedEscaped
} else {
OnWhitespace
}
}
c if c.is_ascii_whitespace() => {
end = i;
OnWhitespace
}
_ => Unquoted,
},
Unquoted => match c {
'\\' => {
if cfg!(unix) {
escaped.push_str(&input[unescaped_start..i]);
unescaped_start = i + 1;
UnquotedEscaped
} else {
Unquoted
}
}
c if c.is_ascii_whitespace() => {
end = i;
OnWhitespace
}
_ => Unquoted,
},
UnquotedEscaped => Unquoted,
Quoted => match c {
'\\' => {
if cfg!(unix) {
escaped.push_str(&input[unescaped_start..i]);
unescaped_start = i + 1;
QuoteEscaped
} else {
Quoted
}
}
'\'' => {
end = i;
OnWhitespace
}
_ => Quoted,
},
QuoteEscaped => Quoted,
Dquoted => match c {
'\\' => {
if cfg!(unix) {
escaped.push_str(&input[unescaped_start..i]);
unescaped_start = i + 1;
DquoteEscaped
} else {
Dquoted
}
}
'"' => {
end = i;
OnWhitespace
}
_ => Dquoted,
},
DquoteEscaped => Dquoted,
};
let c_len = c.len_utf8();
if i == input.len() - c_len && end == 0 {
end = i + c_len;
}
if end > 0 {
let esc_trim = escaped.trim();
let inp = &input[unescaped_start..end];
if !(esc_trim.is_empty() && inp.trim().is_empty()) {
if esc_trim.is_empty() {
words.push(inp.into());
parts.push(inp);
} else {
words.push([escaped, inp.into()].concat().into());
parts.push(&input[part_start..end]);
escaped = "".to_string();
}
}
unescaped_start = i + 1;
part_start = i + 1;
end = 0;
}
}
debug_assert!(words.len() == parts.len());
Self {
state,
words,
parts,
}
}
}
impl<'a> Shellwords<'a> {
/// Checks that the input ends with a whitespace character which is not escaped.
///
/// # Examples
///
/// ```rust
/// use helix_core::shellwords::Shellwords;
/// assert_eq!(Shellwords::from(" ").ends_with_whitespace(), true);
/// assert_eq!(Shellwords::from(":open ").ends_with_whitespace(), true);
/// assert_eq!(Shellwords::from(":open foo.txt ").ends_with_whitespace(), true);
/// assert_eq!(Shellwords::from(":open").ends_with_whitespace(), false);
/// #[cfg(unix)]
/// assert_eq!(Shellwords::from(":open a\\ ").ends_with_whitespace(), false);
/// #[cfg(unix)]
/// assert_eq!(Shellwords::from(":open a\\ b.txt").ends_with_whitespace(), false);
/// ```
pub fn ends_with_whitespace(&self) -> bool {
matches!(self.state, State::OnWhitespace)
}
/// Returns the list of shellwords calculated from the input string.
pub fn words(&self) -> &[Cow<'a, str>] {
&self.words
}
/// Returns a list of strings which correspond to [`Self::words`] but represent the original
/// text in the input string - including escape characters - without separating whitespace.
pub fn parts(&self) -> &[&'a str] {
&self.parts
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
#[cfg(windows)]
fn test_normal() {
let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#;
let shellwords = Shellwords::from(input);
let result = shellwords.words().to_vec();
let expected = vec![
Cow::from(":o"),
Cow::from("single_word"),
Cow::from("twó"),
Cow::from("wörds"),
Cow::from("\\three\\"),
Cow::from("\\"),
Cow::from("with\\ escaping\\\\"),
];
// TODO test is_owned and is_borrowed, once they get stabilized.
assert_eq!(expected, result);
}
#[test]
#[cfg(unix)]
fn test_normal() {
let input = r#":o single_word twó wörds \three\ \"with\ escaping\\"#;
let shellwords = Shellwords::from(input);
let result = shellwords.words().to_vec();
let expected = vec![
Cow::from(":o"),
Cow::from("single_word"),
Cow::from("twó"),
Cow::from("wörds"),
Cow::from(r#"three "with escaping\"#),
];
// TODO test is_owned and is_borrowed, once they get stabilized.
assert_eq!(expected, result);
}
#[test]
#[cfg(unix)]
fn test_quoted() {
let quoted =
r#":o 'single_word' 'twó wörds' '' ' ''\three\' \"with\ escaping\\' 'quote incomplete"#;
let shellwords = Shellwords::from(quoted);
let result = shellwords.words().to_vec();
let expected = vec![
Cow::from(":o"),
Cow::from("single_word"),
Cow::from("twó wörds"),
Cow::from(r#"three' "with escaping\"#),
Cow::from("quote incomplete"),
];
assert_eq!(expected, result);
}
#[test]
#[cfg(unix)]
fn test_dquoted() {
let dquoted = r#":o "single_word" "twó wörds" "" " ""\three\' \"with\ escaping\\" "dquote incomplete"#;
let shellwords = Shellwords::from(dquoted);
let result = shellwords.words().to_vec();
let expected = vec![
Cow::from(":o"),
Cow::from("single_word"),
Cow::from("twó wörds"),
Cow::from(r#"three' "with escaping\"#),
Cow::from("dquote incomplete"),
];
assert_eq!(expected, result);
}
#[test]
#[cfg(unix)]
fn test_mixed() {
let dquoted = r#":o single_word 'twó wörds' "\three\' \"with\ escaping\\""no space before"'and after' $#%^@ "%^&(%^" ')(*&^%''a\\\\\b' '"#;
let shellwords = Shellwords::from(dquoted);
let result = shellwords.words().to_vec();
let expected = vec![
Cow::from(":o"),
Cow::from("single_word"),
Cow::from("twó wörds"),
Cow::from("three' \"with escaping\\"),
Cow::from("no space before"),
Cow::from("and after"),
Cow::from("$#%^@"),
Cow::from("%^&(%^"),
Cow::from(")(*&^%"),
Cow::from(r#"a\\b"#),
//last ' just changes to quoted but since we dont have anything after it, it should be ignored
];
assert_eq!(expected, result);
}
#[test]
fn test_lists() {
let input =
r#":set statusline.center ["file-type","file-encoding"] '["list", "in", "qoutes"]'"#;
let shellwords = Shellwords::from(input);
let result = shellwords.words().to_vec();
let expected = vec![
Cow::from(":set"),
Cow::from("statusline.center"),
Cow::from(r#"["file-type","file-encoding"]"#),
Cow::from(r#"["list", "in", "qoutes"]"#),
];
assert_eq!(expected, result);
}
#[test]
#[cfg(unix)]
fn test_escaping_unix() {
assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar"));
assert_eq!(escape("foo bar".into()), Cow::Borrowed("foo\\ bar"));
assert_eq!(escape("foo\tbar".into()), Cow::Borrowed("foo\\\tbar"));
}
#[test]
#[cfg(windows)]
fn test_escaping_windows() {
assert_eq!(escape("foobar".into()), Cow::Borrowed("foobar"));
assert_eq!(escape("foo bar".into()), Cow::Borrowed("\"foo bar\""));
}
#[test]
#[cfg(unix)]
fn test_parts() {
assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]);
assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\ "]);
}
#[test]
#[cfg(windows)]
fn test_parts() {
assert_eq!(Shellwords::from(":o a").parts(), &[":o", "a"]);
assert_eq!(Shellwords::from(":o a\\ ").parts(), &[":o", "a\\"]);
}
#[test]
fn test_multibyte_at_end() {
assert_eq!(Shellwords::from("𒀀").parts(), &["𒀀"]);
assert_eq!(
Shellwords::from(":sh echo 𒀀").parts(),
&[":sh", "echo", "𒀀"]
);
assert_eq!(
Shellwords::from(":sh echo 𒀀 hello world𒀀").parts(),
&[":sh", "echo", "𒀀", "hello", "world𒀀"]
);
}
}