pay-respects/utils/src/evals.rs

222 lines
6 KiB
Rust
Raw Normal View History

2024-12-29 16:16:13 +01:00
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
use crate::files::*;
2025-01-06 20:23:44 +01:00
use itertools::Itertools;
2025-01-07 01:16:54 +01:00
use regex_lite::Regex;
2025-01-06 15:57:15 +01:00
fn regex_captures(regex: &str, string: &str) -> Vec<String> {
let regex = Regex::new(regex).unwrap();
2025-01-06 15:57:15 +01:00
let mut caps = Vec::new();
for captures in regex.captures_iter(string) {
for cap in captures.iter().skip(1).flatten() {
2025-01-06 15:57:15 +01:00
caps.push(cap.as_str().to_owned());
}
}
2025-01-06 15:57:15 +01:00
caps
}
/// Extracts the options captured by `regex` from `command`.
///
/// Every captured substring is removed from `command` in place (all of its
/// occurrences are replaced by the empty string), and the captures are
/// returned joined by a single space.
pub fn opt_regex(regex: &str, command: &mut String) -> String {
    let captured = regex_captures(regex, command);
    for option in &captured {
        *command = command.replace(option.as_str(), "");
    }
    captured.join(" ")
}
pub fn err_regex(regex: &str, error_msg: &str) -> String {
2025-01-06 15:57:15 +01:00
let err = regex_captures(regex, error_msg);
err.join(" ")
}
pub fn cmd_regex(regex: &str, command: &str) -> String {
2025-01-06 15:57:15 +01:00
let cmd = regex_captures(regex, command);
cmd.join(" ")
}
2025-04-09 15:53:06 +02:00
/// Runs `command` through `shell -c` and returns its standard output.
///
/// The output is decoded lossily as UTF-8 and split on `'\n'`; every line is
/// trimmed of surrounding whitespace. Output ending in a newline therefore
/// yields a trailing empty string.
///
/// # Panics
///
/// Panics if the shell process cannot be executed.
pub fn eval_shell_command(shell: &str, command: &str) -> Vec<String> {
    let result = std::process::Command::new(shell)
        .args(["-c", command])
        .output()
        .expect("failed to execute process");
    let stdout = String::from_utf8_lossy(&result.stdout);
    stdout
        .split('\n')
        .map(|line| line.trim().to_string())
        .collect()
}
2025-04-09 15:53:06 +02:00
/// Split the full command into command and arguments
pub fn split_command(command: &str) -> Vec<String> {
2024-12-08 15:33:43 +01:00
#[cfg(debug_assertions)]
eprintln!("command: {command}");
// this regex splits the command separated by spaces, except when the space
// is escaped by a backslash or surrounded by quotes
let regex = r#"([^\s"'\\]+|"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\\ )+|\\|\n"#;
let regex = Regex::new(regex).unwrap();
let split_command = regex
.find_iter(command)
.map(|cap| cap.as_str().to_owned())
.collect::<Vec<String>>();
split_command
}
pub fn suggest_typo(typos: &[String], candidates: &[String], executables: &[String]) -> String {
let mut suggestions = Vec::new();
for typo in typos {
let typo = typo.as_str();
if candidates.len() == 1 {
match candidates[0].as_str() {
"path" => {
2024-12-09 17:06:00 +01:00
if typo.contains(std::path::MAIN_SEPARATOR) {
if let Some(suggest) = best_match_file(typo) {
suggestions.push(suggest);
} else {
suggestions.push(typo.to_string());
}
continue;
}
if let Some(suggest) = find_similar(typo, executables, Some(2)) {
suggestions.push(suggest);
} else {
suggestions.push(typo.to_string());
}
}
"file" => {
2024-12-09 16:27:39 +01:00
if let Some(suggest) = best_match_file(typo) {
suggestions.push(suggest);
} else {
suggestions.push(typo.to_string());
}
}
2024-12-09 17:06:00 +01:00
_ => {
unreachable!("suggest_typo: must have at least two candidates")
}
}
} else if let Some(suggest) = find_similar(typo, candidates, Some(2)) {
suggestions.push(suggest);
} else {
suggestions.push(typo.to_string());
}
}
suggestions.join(" ")
}
/// Returns the entry of `executables` closest to `typo`, if any is close
/// enough according to `find_similar` with a threshold of 3.
pub fn best_match_path(typo: &str, executables: &[String]) -> Option<String> {
    let strictness = Some(3);
    find_similar(typo, executables, strictness)
}
/// Returns every entry of `executables` tied for closest to `typo`, if any is
/// close enough according to `find_similars` with a threshold of 3.
pub fn best_matches_path(typo: &str, executables: &[String]) -> Option<Vec<String>> {
    let strictness = Some(3);
    find_similars(typo, executables, strictness)
}
2025-04-09 15:53:06 +02:00
/// Find the best match for a typo given a list of candidates
///
/// A candidate is accepted only when its distance to `typo` (as computed by
/// `compare_string`) is at most `typo.chars().count() / threshold`, so the
/// higher the threshold, the stricter the comparison:
/// 1: up to 100% of the typo's length
/// 2: 50%
/// 3: 33%
/// ... etc
///
/// `threshold` defaults to 2 when `None`; a threshold of 0 is treated as 1.
/// Empty candidates are skipped. When several candidates share the smallest
/// distance, the one appearing first in `candidates` is returned.
///
/// Returns `None` when no candidate is close enough.
pub fn find_similar(typo: &str, candidates: &[String], threshold: Option<usize>) -> Option<String> {
    // Clamp to 1: `Some(0)` would otherwise panic with a division by zero.
    let threshold = threshold.unwrap_or(2).max(1);
    // Exclusive upper bound on the accepted distance; tightened on every hit.
    let mut min_distance = typo.chars().count() / threshold + 1;
    let mut best: Option<&String> = None;
    for candidate in candidates.iter().filter(|c| !c.is_empty()) {
        let distance = compare_string(typo, candidate);
        if distance < min_distance {
            min_distance = distance;
            best = Some(candidate);
        }
    }
    best.cloned()
}
2025-04-09 15:53:06 +02:00
/// Similar to `find_similar`, but returns a vector of all candidates
/// with the same minimum distance
2025-01-06 16:27:11 +01:00
pub fn find_similars(
typo: &str,
candidates: &[String],
threshold: Option<usize>,
) -> Option<Vec<String>> {
let threshold = threshold.unwrap_or(2);
let mut min_distance = typo.chars().count() / threshold + 1;
let mut min_distance_index = vec![];
for (i, candidate) in candidates.iter().enumerate() {
if candidate.is_empty() {
continue;
}
let distance = compare_string(typo, candidate);
2025-01-06 20:23:44 +01:00
use std::cmp::Ordering::*;
match distance.cmp(&min_distance) {
2025-01-06 23:14:13 +01:00
Equal => {
if !min_distance_index.is_empty() {
min_distance_index.push(i)
}
2025-01-07 01:16:54 +01:00
}
2025-01-06 20:23:44 +01:00
Less => {
min_distance = distance;
min_distance_index.clear();
min_distance_index.push(i);
}
_ => {}
}
}
if !min_distance_index.is_empty() {
2025-01-06 16:27:11 +01:00
return Some(
min_distance_index
.iter()
.map(|&i| candidates[i].to_string())
2025-01-06 20:23:44 +01:00
.collect::<Vec<String>>()
.into_iter()
.unique()
2025-01-06 16:27:11 +01:00
.collect(),
);
}
None
}
/// Edit distance between `a` and `b`, counted in `char`s.
///
/// This is the optimal string alignment variant of the Damerau-Levenshtein
/// distance: insertions, deletions, substitutions and transpositions of two
/// adjacent characters each cost 1, and no substring is edited more than
/// once (so `"ca"` -> `"abc"` is 3, not 2).
// Index loops are kept on purpose: each cell reads neighbours at several
// offsets in both the matrix and the two character buffers.
#[allow(clippy::needless_range_loop)]
pub fn compare_string(a: &str, b: &str) -> usize {
    // Decode once so each character lookup is O(1) instead of rescanning the
    // UTF-8 string for every cell.
    let a: Vec<char> = a.chars().collect();
    let b: Vec<char> = b.chars().collect();
    let (rows, cols) = (a.len() + 1, b.len() + 1);

    // matrix[i][j] = distance between the first i chars of `a` and the first
    // j chars of `b`.
    let mut matrix = vec![vec![0usize; cols]; rows];
    for (i, row) in matrix.iter_mut().enumerate() {
        row[0] = i;
    }
    for (j, cell) in matrix[0].iter_mut().enumerate() {
        *cell = j;
    }

    for i in 1..rows {
        for j in 1..cols {
            let cost = usize::from(a[i - 1] != b[j - 1]);
            let mut best = (matrix[i - 1][j] + 1) // deletion
                .min(matrix[i][j - 1] + 1) // insertion
                .min(matrix[i - 1][j - 1] + cost); // substitution / match
            // addition for optimal string alignment distance
            if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1] {
                best = best.min(matrix[i - 2][j - 2] + 1);
            }
            matrix[i][j] = best;
        }
    }
    matrix[a.len()][b.len()]
}