SimpleChat:DU: Switch trim garbage hist based to maxUniq simple

Instead of blindly building a histogram for the specified substring
length and then checking whether any new char appears within the
specified min garbage length limit, NOW exit the learn state when the
specified maxUniq chars are found. In turn, there should be no new chars
within the specified min garbage length required limit.

TODO: Need to track char classes like alphabets, numerals and
special/other chars.
This commit is contained in:
HanishKVC 2024-05-26 01:57:28 +05:30
parent f33aa28149
commit d1e73d8777

View file

@ -73,20 +73,25 @@ export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThresho
/**
* A simple minded try trim garbage at end using histogram characteristics
* @param {string} sIn
* @param {number} maxSubL
* @param {number} maxUniq
* @param {number} maxMatchLenThreshold
*/
export function trim_hist_garbage_at_end(sIn, maxSubL, maxMatchLenThreshold) {
export function trim_hist_garbage_at_end(sIn, maxUniq, maxMatchLenThreshold) {
if (sIn.length < maxMatchLenThreshold) {
return { trimmed: false, data: sIn };
}
// Learn
let hist = {};
for(let i=0; i<maxSubL; i++) {
let iUniq = 0;
for(let i=0; i<maxMatchLenThreshold; i++) {
let c = sIn[sIn.length-1-i];
if (c in hist) {
hist[c] += 1;
} else {
iUniq += 1;
if (iUniq >= maxUniq) {
break;
}
hist[c] = 1;
}
}