From bb3fe48c1649e46161d648d04f44010c9c7a45af Mon Sep 17 00:00:00 2001 From: HanishKVC Date: Wed, 15 May 2024 19:20:38 +0530 Subject: [PATCH] SimpCfg+DataUtilsString: Move string helpers to its own file --- common/datautils_string.hpp | 220 ++++++++++++++++++++++++++++++++++++ common/simpcfg.hpp | 216 +---------------------------------- 2 files changed, 223 insertions(+), 213 deletions(-) create mode 100644 common/datautils_string.hpp diff --git a/common/datautils_string.hpp b/common/datautils_string.hpp new file mode 100644 index 000000000..b1488d40b --- /dev/null +++ b/common/datautils_string.hpp @@ -0,0 +1,220 @@ +#pragma once + +/** + * A bunch of helper routines to work with strings. + * by Humans for All + * + * ## Some notes for later + * + * NativeCharSize encoded char refers to chars which fit within the size of char type in a given + * type of c++ string or base bitsize of a encoding standard, like 1 byte in case of std::string, + * utf-8, ... + * * example english alphabets in utf-8 encoding space are 1byte chars, in its variable length + * encoding space. + * + * MultiNativeCharSize encoded char refers to chars which occupy multiple base-char-bit-size of + * a c++ string type or char encoding standard. + * * example indian scripts alphabets in utf-8 encoding space occupy multiple bytes in its variable + * length encoding space. + * + * Sane variable length encoding - refers to encoding where the values of NativeCharSized chars of + * a char encoding space cant overlap with values in NativeCharSize subparts of MultiNativeCharSized + * chars of the same char encoding standard. + * * utf-8 shows this behaviour + * * chances are utf-16 and utf-32 also show this behaviour (need to cross check once) + * +*/ + +#include + +#include "log.h" + + +#undef DUS_DEBUG_VERBOSE + +#undef DUS_STR_OVERSMART +#ifdef DUS_STR_OVERSMART +#define str_trim str_trim_oversmart +#else +#define str_trim str_trim_dumb +#endif + + +inline size_t wcs_to_mbs(std::string &sDest, const std::wstring &wSrc) { + std::mbstate_t mbState = std::mbstate_t(); + const wchar_t *wSrcP = wSrc.c_str(); + auto reqLen = std::wcsrtombs(nullptr, &wSrcP, 0, &mbState); + if (reqLen == static_cast(-1)) { + throw std::runtime_error("ERRR:WCS2MBS:Failed probing of size..."); + } + sDest.resize(reqLen); + return std::wcsrtombs(sDest.data(), &wSrcP, sDest.length(), &mbState); +} + +inline size_t mbs_to_wcs(std::wstring &wDest, const std::string &sSrc) { + std::mbstate_t mbState = std::mbstate_t(); + const char *sSrcP = sSrc.c_str(); + auto reqLen = std::mbsrtowcs(nullptr, &sSrcP, 0, &mbState); + if (reqLen == static_cast(-1)) { + throw std::runtime_error("ERRR:MBS2WCS:Failed probing of size..."); + } + wDest.resize(reqLen); + return std::mbsrtowcs(wDest.data(), &sSrcP, wDest.length(), &mbState); +} + +template +inline void dumphex_string(const TString &sIn, const std::string &msgTag){ + LDBUG("%s[ ", msgTag.c_str()); + for(auto c: sIn) { + auto cSize = sizeof(c); + if (cSize == 1) { + LDBUG("%02x, ", (uint8_t)c); + } else if (cSize == 2) { + LDBUG("%04x, ", (uint16_t)c); + } else if (cSize == 4) { + LDBUG("%08x, ", (uint32_t)c); + } else { + std::stringstream ss; + ss << "ERRR:" << __func__ << ":Unsupported char type with size [" << cSize << "]"; + throw std::runtime_error( ss.str().c_str() ); + } + } + LDBUG_LN(" ]"); +} + +// Remove chars from begin and end of the passed string, provided the char +// belongs to one of the chars in trimChars. +// +// NOTE: This will work perfectly provided the string being trimmed as well as +// chars being trimmed are made up of NativeCharSize chars from same encoded space. +// For utf-8, this means the ascii equivalent 1byteSized chars of utf8 and not +// variable length MultiNativeCharSize (ie multibye in case of utf-8) ones. +// NOTE: It will also work, if atleast either end of string as well as trimChars +// have NativeCharSize chars from their encoding space, rather than variable +// length MultiNativeCharSize based chars if any. There needs to be NativeCharSized +// chars beyond any chars that get trimmed, on either side. +// +// NOTE: Given the way UTF-8 char encoding is designed, where NativeCharSize 1byte +// encoded chars are fully unique and dont overlap with any bytes from any of the +// variable length MultiNativeCharSize encoded chars in the utf-8 space, so as long as +// the trimChars belong to NativeCharSize chars subset, the logic should work, even +// if string has a mixture of NativeCharSize and MultiNativeCharSize encoded chars. +// Chances are utf-16 and utf-32 also have similar characteristics wrt thier +// NativeCharSize encoded chars (ie those fully encoded within single 16bit and 32bit +// value respectively), and so equivalent semantic applies to them also. +// +// ALERT: Given that this simple minded logic, works at individual NativeCharSize level +// only, If trimChars involve variable length MultiNativeCharSize encoded chars, then +// * because different NativeCharSize subparts (bytes in case of utf-8) from different +// MultiNativeCharSize trim chars when clubbed together can map to some other new char +// in a variable length encoded char space, if there is that new char at either end +// of the string, it may get trimmed, because of the possibility of mix up mentioned. +// * given that different variable length MultiNativeCharSize encoded chars may have +// some common NativeCharSize subparts (bytes in case of utf-8) between them, if one +// of these chars is at either end of the string and another char is in trimChars, +// then string may get partially trimmed wrt such a char at either end. +// +template +inline TString str_trim_dumb(TString sin, const TString &trimChars=" \t\n") { +#ifdef DUS_DEBUG_VERBOSE + dumphex_string(sin, "DBUG:StrTrimDumb:Str:"); + dumphex_string(trimChars, "DBUG:StrTrimDumb:TrimChars:"); +#endif + sin.erase(sin.find_last_not_of(trimChars)+1); + sin.erase(0, sin.find_first_not_of(trimChars)); + return sin; +} + +// Remove chars from begin and end of the passed string, provided the char belongs +// to one of the chars in trimChars. +// NOTE: Internally converts to wchar/wstring to try and support proper trimming, +// wrt possibly more languages, to some extent. IE even if the passed string +// contains multibyte encoded characters in it in utf-8 space (ie MultiNativeCharSize), +// it may get converted to NativeCharSize chars in the expanded wchar_t encoding space, +// thus leading to fixed NativeCharSize driven logic itself handling things sufficiently. +// Look at str_trim_dumb comments for additional aspects. +inline std::string str_trim_oversmart(std::string sIn, const std::string &trimChars=" \t\n") { + std::wstring wIn; + mbs_to_wcs(wIn, sIn); + std::wstring wTrimChars; + mbs_to_wcs(wTrimChars, trimChars); + auto wOut = str_trim_dumb(wIn, wTrimChars); + std::string sOut; + wcs_to_mbs(sOut, wOut); + return sOut; +} + +// Remove atmost 1 char at the begin and 1 char at the end of the passed string, +// provided the char belongs to one of the chars in trimChars. +// +// NOTE: Chars being trimmed (ie in trimChars) needs to be part of NativeCharSize +// subset of the string's encoded char space, to avoid mix up when working with +// strings which can be utf-8/utf-16/utf-32/sane-variable-length encoded strings. +// +// NOTE:UTF8: This will work provided the string being trimmed as well the chars +// being trimmed are made up of 1byte encoded chars in case of utf8 encoding space. +// If the string being trimmed includes multibyte (ie MultiNativeCharSize) encoded +// characters at either end, then trimming can mess things up, if you have multibyte +// encoded utf-8 chars in the trimChars set. +// +// Currently given that SimpCfg only uses this with NativeCharSize chars in the +// trimChars and most of the platforms are likely to be using utf-8 based char +// space (which is a realtively sane variable length char encoding from this +// logics perspective), so not providing oversmart variant. +// +template +inline TString str_trim_single(TString sin, const TString& trimChars=" \t\n") { + if (sin.empty()) return sin; + for(auto c: trimChars) { + if (c == sin.front()) { + sin = sin.substr(1, TString::npos); + break; + } + } + if (sin.empty()) return sin; + for(auto c: trimChars) { + if (c == sin.back()) { + sin = sin.substr(0, sin.length()-1); + break; + } + } + return sin; +} + +// Convert to lower case, if language has upper and lower case semantic +// +// This works for fixed size encoded char spaces. +// +// For variable length encoded char spaces, it can work +// * if one is doing the conversion for languages which fit into NativeCharSized chars in it +// * AND if one is working with a sane variable length encoding standard +// * ex: this will work if trying to do the conversion for english language within utf-8 +// +template +inline TString str_tolower(const TString &sin) { + TString sout; + sout.resize(sin.size()); + std::transform(sin.begin(), sin.end(), sout.begin(), [](auto c)->auto {return std::tolower(c);}); +#ifdef DUS_DEBUG_VERBOSE + dumphex_string(sin, "DBUG:StrToLower:in:"); + dumphex_string(sout, "DBUG:StrToLower:out:"); +#endif + return sout; +} + +inline void str_compare_dump(const std::string &s1, const std::string &s2) { + LDBUG_LN("DBUG:%s:%s:Len:%zu", __func__, s1.c_str(), s1.length()); + LDBUG_LN("DBUG:%s:%s:Len:%zu", __func__, s2.c_str(), s2.length()); + int minLen = s1.length() < s2.length() ? s1.length() : s2.length(); + for(int i=0; i +std::string str(TypeWithStrSupp value) { + std::stringstream ss; + ss << value; + return ss.str(); +} diff --git a/common/simpcfg.hpp b/common/simpcfg.hpp index dfaf16c07..ffcadc3f4 100644 --- a/common/simpcfg.hpp +++ b/common/simpcfg.hpp @@ -4,6 +4,8 @@ * Provides a simple direct 1-level only config file logic * by Humans for All * + * This builds on the GroupKV class. + * * ## File format * * It can consist of multiple config groups. @@ -24,24 +26,6 @@ * It tries to provide a crude expanded form of array wrt any of the above supported types. * For this one needs to define keys using the pattern TheKeyName-0, TheKeyName-1, .... * - * ## Additional notes - * - * NativeCharSize encoded char refers to chars which fit within the size of char type in a given - * type of c++ string or base bitsize of a encoding standard, like 1 byte in case of std::string, - * utf-8, ... - * * example english alphabets in utf-8 encoding space are 1byte chars, in its variable length - * encoding space. - * - * MultiNativeCharSize encoded char refers to chars which occupy multiple base-char-bit-size of - * a c++ string type or char encoding standard. - * * example indian scripts alphabets in utf-8 encoding space occupy multiple bytes in its variable - * length encoding space. - * - * Sane variable length encoding - refers to encoding where the values of NativeCharSized chars of - * a char encoding space cant overlap with values in NativeCharSize subparts of MultiNativeCharSized - * chars of the same char encoding standard. - * * utf-8 shows this behaviour - * * chances are utf-16 and utf-32 also show this behaviour (need to cross check once) */ #include @@ -54,203 +38,9 @@ #include #include "groupkv.hpp" +#include "datautils_string.hpp" -#undef SC_DEBUG_VERBOSE - -#undef SC_STR_OVERSMART -#ifdef SC_STR_OVERSMART -#define str_trim str_trim_oversmart -#else -#define str_trim str_trim_dumb -#endif - - -// **** **** **** String related helpers **** **** **** // - - -inline size_t wcs_to_mbs(std::string &sDest, const std::wstring &wSrc) { - std::mbstate_t mbState = std::mbstate_t(); - const wchar_t *wSrcP = wSrc.c_str(); - auto reqLen = std::wcsrtombs(nullptr, &wSrcP, 0, &mbState); - if (reqLen == static_cast(-1)) { - throw std::runtime_error("ERRR:WCS2MBS:Failed probing of size..."); - } - sDest.resize(reqLen); - return std::wcsrtombs(sDest.data(), &wSrcP, sDest.length(), &mbState); -} - -inline size_t mbs_to_wcs(std::wstring &wDest, const std::string &sSrc) { - std::mbstate_t mbState = std::mbstate_t(); - const char *sSrcP = sSrc.c_str(); - auto reqLen = std::mbsrtowcs(nullptr, &sSrcP, 0, &mbState); - if (reqLen == static_cast(-1)) { - throw std::runtime_error("ERRR:MBS2WCS:Failed probing of size..."); - } - wDest.resize(reqLen); - return std::mbsrtowcs(wDest.data(), &sSrcP, wDest.length(), &mbState); -} - -template -inline void dumphex_string(const TString &sIn, const std::string &msgTag){ - LDBUG("%s[ ", msgTag.c_str()); - for(auto c: sIn) { - auto cSize = sizeof(c); - if (cSize == 1) { - LDBUG("%02x, ", (uint8_t)c); - } else if (cSize == 2) { - LDBUG("%04x, ", (uint16_t)c); - } else if (cSize == 4) { - LDBUG("%08x, ", (uint32_t)c); - } else { - std::stringstream ss; - ss << "ERRR:" << __func__ << ":Unsupported char type with size [" << cSize << "]"; - throw std::runtime_error( ss.str().c_str() ); - } - } - LDBUG_LN(" ]"); -} - -// Remove chars from begin and end of the passed string, provided the char -// belongs to one of the chars in trimChars. -// -// NOTE: This will work perfectly provided the string being trimmed as well as -// chars being trimmed are made up of NativeCharSize chars from same encoded space. -// For utf-8, this means the ascii equivalent 1byteSized chars of utf8 and not -// variable length MultiNativeCharSize (ie multibye in case of utf-8) ones. -// NOTE: It will also work, if atleast either end of string as well as trimChars -// have NativeCharSize chars from their encoding space, rather than variable -// length MultiNativeCharSize based chars if any. There needs to be NativeCharSized -// chars beyond any chars that get trimmed, on either side. -// -// NOTE: Given the way UTF-8 char encoding is designed, where NativeCharSize 1byte -// encoded chars are fully unique and dont overlap with any bytes from any of the -// variable length MultiNativeCharSize encoded chars in the utf-8 space, so as long as -// the trimChars belong to NativeCharSize chars subset, the logic should work, even -// if string has a mixture of NativeCharSize and MultiNativeCharSize encoded chars. -// Chances are utf-16 and utf-32 also have similar characteristics wrt thier -// NativeCharSize encoded chars (ie those fully encoded within single 16bit and 32bit -// value respectively), and so equivalent semantic applies to them also. -// -// ALERT: Given that this simple minded logic, works at individual NativeCharSize level -// only, If trimChars involve variable length MultiNativeCharSize encoded chars, then -// * because different NativeCharSize subparts (bytes in case of utf-8) from different -// MultiNativeCharSize trim chars when clubbed together can map to some other new char -// in a variable length encoded char space, if there is that new char at either end -// of the string, it may get trimmed, because of the possibility of mix up mentioned. -// * given that different variable length MultiNativeCharSize encoded chars may have -// some common NativeCharSize subparts (bytes in case of utf-8) between them, if one -// of these chars is at either end of the string and another char is in trimChars, -// then string may get partially trimmed wrt such a char at either end. -// -template -inline TString str_trim_dumb(TString sin, const TString &trimChars=" \t\n") { -#ifdef SC_DEBUG_VERBOSE - dumphex_string(sin, "DBUG:StrTrimDumb:Str:"); - dumphex_string(trimChars, "DBUG:StrTrimDumb:TrimChars:"); -#endif - sin.erase(sin.find_last_not_of(trimChars)+1); - sin.erase(0, sin.find_first_not_of(trimChars)); - return sin; -} - -// Remove chars from begin and end of the passed string, provided the char belongs -// to one of the chars in trimChars. -// NOTE: Internally converts to wchar/wstring to try and support proper trimming, -// wrt possibly more languages, to some extent. IE even if the passed string -// contains multibyte encoded characters in it in utf-8 space (ie MultiNativeCharSize), -// it may get converted to NativeCharSize chars in the expanded wchar_t encoding space, -// thus leading to fixed NativeCharSize driven logic itself handling things sufficiently. -// Look at str_trim_dumb comments for additional aspects. -inline std::string str_trim_oversmart(std::string sIn, const std::string &trimChars=" \t\n") { - std::wstring wIn; - mbs_to_wcs(wIn, sIn); - std::wstring wTrimChars; - mbs_to_wcs(wTrimChars, trimChars); - auto wOut = str_trim_dumb(wIn, wTrimChars); - std::string sOut; - wcs_to_mbs(sOut, wOut); - return sOut; -} - -// Remove atmost 1 char at the begin and 1 char at the end of the passed string, -// provided the char belongs to one of the chars in trimChars. -// -// NOTE: Chars being trimmed (ie in trimChars) needs to be part of NativeCharSize -// subset of the string's encoded char space, to avoid mix up when working with -// strings which can be utf-8/utf-16/utf-32/sane-variable-length encoded strings. -// -// NOTE:UTF8: This will work provided the string being trimmed as well the chars -// being trimmed are made up of 1byte encoded chars in case of utf8 encoding space. -// If the string being trimmed includes multibyte (ie MultiNativeCharSize) encoded -// characters at either end, then trimming can mess things up, if you have multibyte -// encoded utf-8 chars in the trimChars set. -// -// Currently given that SimpCfg only uses this with NativeCharSize chars in the -// trimChars and most of the platforms are likely to be using utf-8 based char -// space (which is a realtively sane variable length char encoding from this -// logics perspective), so not providing oversmart variant. -// -template -inline TString str_trim_single(TString sin, const TString& trimChars=" \t\n") { - if (sin.empty()) return sin; - for(auto c: trimChars) { - if (c == sin.front()) { - sin = sin.substr(1, TString::npos); - break; - } - } - if (sin.empty()) return sin; - for(auto c: trimChars) { - if (c == sin.back()) { - sin = sin.substr(0, sin.length()-1); - break; - } - } - return sin; -} - -// Convert to lower case, if language has upper and lower case semantic -// -// This works for fixed size encoded char spaces. -// -// For variable length encoded char spaces, it can work -// * if one is doing the conversion for languages which fit into NativeCharSized chars in it -// * AND if one is working with a sane variable length encoding standard -// * ex: this will work if trying to do the conversion for english language within utf-8 -// -template -inline TString str_tolower(const TString &sin) { - TString sout; - sout.resize(sin.size()); - std::transform(sin.begin(), sin.end(), sout.begin(), [](auto c)->auto {return std::tolower(c);}); -#ifdef SC_DEBUG_VERBOSE - dumphex_string(sin, "DBUG:StrToLower:in:"); - dumphex_string(sout, "DBUG:StrToLower:out:"); -#endif - return sout; -} - -inline void str_compare_dump(const std::string &s1, const std::string &s2) { - LDBUG_LN("DBUG:%s:%s:Len:%zu", __func__, s1.c_str(), s1.length()); - LDBUG_LN("DBUG:%s:%s:Len:%zu", __func__, s2.c_str(), s2.length()); - int minLen = s1.length() < s2.length() ? s1.length() : s2.length(); - for(int i=0; i -std::string str(TypeWithStrSupp value) { - std::stringstream ss; - ss << value; - return ss.str(); -} - - -// **** **** **** the SimpCfg **** **** **** // - class SimpCfg : public GroupKV {