From ae9f610663e77221464b316c56e1c04ad4c76f7b Mon Sep 17 00:00:00 2001 From: HanishKVC Date: Sun, 26 May 2024 02:14:26 +0530 Subject: [PATCH] SimpleChat:DU: Bring in maxType to the mix along with maxUniq Allow for more unique chars, but then ensure that a given type of char, i.e. numerals, alphabets or other types, doesn't cross the specified maxType limit. This allows intermixed text garbage to be identified and trimmed. --- .../server/public_simplechat/datautils.mjs | 31 ++++++++++++++++--- .../server/public_simplechat/simplechat.js | 2 +- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/examples/server/public_simplechat/datautils.mjs b/examples/server/public_simplechat/datautils.mjs index b42d58555..23d4d0f48 100644 --- a/examples/server/public_simplechat/datautils.mjs +++ b/examples/server/public_simplechat/datautils.mjs @@ -72,14 +72,26 @@ export function trim_repeat_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThresho /** * A simple minded try trim garbage at end using histogram characteristics + * + * Allow the garbage to contain upto maxUniq chars, but at the same time ensure that + * a given type of char ie numerals or alphabets or other types dont cross the specified + * maxType limit. This allows intermixed text garbage to be identified and trimmed. + * + * ALERT: This is not perfect and only provides a rough garbage identification logic. + * Also it currently only differentiates between character classes wrt english. 
+ * * @param {string} sIn + * @param {number} maxType * @param {number} maxUniq * @param {number} maxMatchLenThreshold */ -export function trim_hist_garbage_at_end(sIn, maxUniq, maxMatchLenThreshold) { +export function trim_hist_garbage_at_end(sIn, maxType, maxUniq, maxMatchLenThreshold) { if (sIn.length < maxMatchLenThreshold) { return { trimmed: false, data: sIn }; } + let iAlp = 0; + let iNum = 0; + let iOth = 0; // Learn let hist = {}; let iUniq = 0; @@ -88,6 +100,13 @@ export function trim_hist_garbage_at_end(sIn, maxUniq, maxMatchLenThreshold) { if (c in hist) { hist[c] += 1; } else { + if(c.match(/[0-9]/) != null) { + iNum += 1; + } else if(c.match(/[A-Za-z]/) != null) { + iAlp += 1; + } else { + iOth += 1; + } iUniq += 1; if (iUniq >= maxUniq) { break; @@ -96,6 +115,9 @@ export function trim_hist_garbage_at_end(sIn, maxUniq, maxMatchLenThreshold) { } } console.log("DBUG:TrimHistGarbage:", hist); + if ((iAlp > maxType) || (iNum > maxType) || (iOth > maxType)) { + return { trimmed: false, data: sIn }; + } // Catch and Trim for(let i=0; i < sIn.length; i++) { let c = sIn[sIn.length-1-i]; @@ -112,13 +134,14 @@ export function trim_hist_garbage_at_end(sIn, maxUniq, maxMatchLenThreshold) { /** * Keep trimming repeatedly using hist_garbage logic, till you no longer can * @param {any} sIn - * @param {number} maxSubL + * @param {number} maxType + * @param {number} maxUniq * @param {number} maxMatchLenThreshold */ -export function trim_hist_garbage_at_end_loop(sIn, maxSubL, maxMatchLenThreshold) { +export function trim_hist_garbage_at_end_loop(sIn, maxType, maxUniq, maxMatchLenThreshold) { let sCur = sIn; while (true) { - let got = trim_hist_garbage_at_end(sCur, maxSubL, maxMatchLenThreshold); + let got = trim_hist_garbage_at_end(sCur, maxType, maxUniq, maxMatchLenThreshold); if (!got.trimmed) { return got.data; } diff --git a/examples/server/public_simplechat/simplechat.js b/examples/server/public_simplechat/simplechat.js index c09ca73b1..3daba16bd 100644 --- 
a/examples/server/public_simplechat/simplechat.js +++ b/examples/server/public_simplechat/simplechat.js @@ -481,7 +481,7 @@ class MultiChatUI { assistantMsg = respBody["content"]; } } - assistantMsg = du.trim_hist_garbage_at_end_loop(assistantMsg, 12, 72); + assistantMsg = du.trim_hist_garbage_at_end_loop(assistantMsg, 8, 16, 72); chat.add(Roles.Assistant, assistantMsg); if (chatId == this.curChatId) { chat.show(this.elDivChat);