SimpleChat:DU: Switch trim garbage hist based to maxUniq simple
Instead of blindly building histogram for specified substring length, and then checking if any new char within specified min garbage length limit, NOW exit learn state when specified maxUniq chars are found. Inturn there should be no new chars with in the specified min garbage length required limit. TODO: Need to track char classes like alphabets, numerals and special/other chars.
This commit is contained in:
parent
f33aa28149
commit
d1e73d8777
1 changed files with 8 additions and 3 deletions
|
@ -73,20 +73,25 @@ export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThresho
|
|||
/**
|
||||
* A simple minded try trim garbage at end using histogram characteristics
|
||||
* @param {string} sIn
|
||||
* @param {number} maxSubL
|
||||
* @param {number} maxUniq
|
||||
* @param {number} maxMatchLenThreshold
|
||||
*/
|
||||
export function trim_hist_garbage_at_end(sIn, maxSubL, maxMatchLenThreshold) {
|
||||
export function trim_hist_garbage_at_end(sIn, maxUniq, maxMatchLenThreshold) {
|
||||
if (sIn.length < maxMatchLenThreshold) {
|
||||
return { trimmed: false, data: sIn };
|
||||
}
|
||||
// Learn
|
||||
let hist = {};
|
||||
for(let i=0; i<maxSubL; i++) {
|
||||
let iUniq = 0;
|
||||
for(let i=0; i<maxMatchLenThreshold; i++) {
|
||||
let c = sIn[sIn.length-1-i];
|
||||
if (c in hist) {
|
||||
hist[c] += 1;
|
||||
} else {
|
||||
iUniq += 1;
|
||||
if (iUniq >= maxUniq) {
|
||||
break;
|
||||
}
|
||||
hist[c] = 1;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue