diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py
index d774fcabe..1528a13db 100644
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -85,7 +85,6 @@ UNICODE_CATEGORY_TO_INDEX = {
 
 
 codepoint_categs = array.array('B', [0]) * MAX_CODEPOINTS  # Undefined
-table_whitespace = []
 table_lowercase = []
 table_uppercase = []
 table_nfd = []
@@ -111,19 +110,20 @@ for (cpt, cpt_lower, cpt_upper, categ, bidir) in unicode_data_iter():
         table_nfd.append((cpt, norm))
 
 
-# whitespaces, see "<White_Space>" https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
-table_whitespace.extend(range(0x0009, 0x000D + 1))
-table_whitespace.extend(range(0x2000, 0x200A + 1))
-table_whitespace.extend([0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000])
-
-
 # sort by codepoint
-table_whitespace.sort()
 table_lowercase.sort()
 table_uppercase.sort()
 table_nfd.sort()
 
 
+# Whitespace codepoints as inclusive (start, last) ranges, see "White_Space" in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+whitespace_ranges: list[tuple[int, int]] = []  # inclusive (start, last) pairs, kept in insertion order (not sorted by codepoint)
+whitespace_ranges.append((0x0009, 0x000D))
+whitespace_ranges.append((0x2000, 0x200A))
+for whitespace in [0x0020, 0x0085, 0x00A0, 0x1680, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000]:
+    whitespace_ranges.append((whitespace, whitespace))  # a lone codepoint is stored as a one-element range
+
+
 # run length encoding, see unicode_cpt_category() in unicode.cpp
 assert (max(UNICODE_CATEGORY_TO_INDEX.values()) < 32)
 codepoint_categs_runs = [codepoint_categs[0]]  # 5 bits categ + 11 bits length
@@ -162,7 +162,6 @@ out("""\
 #include <cstdint>
 #include <vector>
 #include <unordered_map>
-#include <unordered_set>
 """)
 
 out("const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length encoding, 5 bits categ + 11 bits length")
@@ -170,9 +169,9 @@ for rle in codepoint_categs_runs:
     out("0x%04X," % rle)
 out("};\n")
 
-out("const std::vector<uint32_t> unicode_vec_whitespace = {")
-for codepoint in table_whitespace:
-    out("0x%06X," % codepoint)
+out("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {")
+for (start, last) in whitespace_ranges:  # one {start, last} initializer per inclusive range
+    out("{0x%06X, 0x%06X}," % (start, last))
 out("};\n")
 
 out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp
index 2591723ce..1a2ceb017 100644
--- a/src/unicode-data.cpp
+++ b/src/unicode-data.cpp
@@ -4526,32 +4526,18 @@ const std::vector<uint16_t> unicode_rle_codepoints_categs = {  // run length enc
 0x0020,
 };
 
-const std::vector<uint32_t> unicode_vec_whitespace = {
-0x000009,
-0x00000A,
-0x00000B,
-0x00000C,
-0x00000D,
-0x000020,
-0x000085,
-0x0000A0,
-0x001680,
-0x002000,
-0x002001,
-0x002002,
-0x002003,
-0x002004,
-0x002005,
-0x002006,
-0x002007,
-0x002008,
-0x002009,
-0x00200A,
-0x002028,
-0x002029,
-0x00202F,
-0x00205F,
-0x003000,
+const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace = {
+{0x000009, 0x00000D},
+{0x002000, 0x00200A},
+{0x000020, 0x000020},
+{0x000085, 0x000085},
+{0x0000A0, 0x0000A0},
+{0x001680, 0x001680},
+{0x002028, 0x002028},
+{0x002029, 0x002029},
+{0x00202F, 0x00202F},
+{0x00205F, 0x00205F},
+{0x003000, 0x003000},
 };
 
 const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
diff --git a/src/unicode-data.h b/src/unicode-data.h
index 682f79c37..447826879 100644
--- a/src/unicode-data.h
+++ b/src/unicode-data.h
@@ -13,7 +13,7 @@ struct range_nfd {
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
 extern const std::vector<uint16_t> unicode_rle_codepoints_categs;
-extern const std::vector<uint32_t> unicode_vec_whitespace;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
 extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
 extern const std::vector<range_nfd> unicode_ranges_nfd;
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 725476600..6ebef0ec9 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -597,8 +597,10 @@ codepoint_categ unicode_cpt_category(const uint32_t cp) {
         }
         GGML_ASSERT(cpt == MAX_CODEPOINTS);
 
-        for (auto cpt : unicode_vec_whitespace) {
-            cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+        for (auto p : unicode_ranges_whitespace) {
+            for (uint32_t cpt = p.first; cpt <= p.second; ++cpt) {
+                cpt_categs[cpt].set_flag(codepoint_categ::WHITESPACE);
+            }
         }
 
         for (auto p : unicode_map_lowercase) {