mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-06-03 19:22:27 +00:00
Release Cosmopolitan v3.6.0
This release is an atomic upgrade to GCC 14.1.0 with C23 and C++23
This commit is contained in:
parent
62ace3623a
commit
5660ec4741
1585 changed files with 117353 additions and 271644 deletions
357
third_party/libcxx/__format/unicode.h
vendored
357
third_party/libcxx/__format/unicode.h
vendored
|
@ -15,9 +15,9 @@
|
|||
#include <__concepts/same_as.h>
|
||||
#include <__config>
|
||||
#include <__format/extended_grapheme_cluster_table.h>
|
||||
#include <__format/indic_conjunct_break_table.h>
|
||||
#include <__iterator/concepts.h>
|
||||
#include <__iterator/readable_traits.h> // iter_value_t
|
||||
#include <__type_traits/make_unsigned.h>
|
||||
#include <__utility/unreachable.h>
|
||||
#include <string_view>
|
||||
|
||||
|
@ -105,7 +105,7 @@ template <contiguous_iterator _Iterator>
|
|||
requires same_as<iter_value_t<_Iterator>, char>
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
|
||||
do {
|
||||
if ((*__char & 0b1000'0000) != 0b1000'0000)
|
||||
if ((*__char & 0b1100'0000) != 0b1000'0000)
|
||||
return false;
|
||||
--__count;
|
||||
++__char;
|
||||
|
@ -155,7 +155,7 @@ public:
|
|||
// - The parser always needs to consume these code units
|
||||
// - The code is optimized for well-formed UTF-8
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
|
||||
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
|
||||
_LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
|
||||
|
||||
// Based on the number of leading 1 bits the number of code units in the
|
||||
// code point can be determined. See
|
||||
|
@ -261,7 +261,7 @@ public:
|
|||
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
|
||||
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
|
||||
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
|
||||
_LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
|
||||
|
||||
char32_t __value = static_cast<char32_t>(*__first_++);
|
||||
if constexpr (sizeof(wchar_t) == 2) {
|
||||
|
@ -294,85 +294,231 @@ private:
|
|||
};
|
||||
# endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
|
||||
bool& __ri_break_allowed,
|
||||
bool __has_extened_pictographic,
|
||||
__extended_grapheme_custer_property_boundary::__property __prev,
|
||||
__extended_grapheme_custer_property_boundary::__property __next) {
|
||||
using __extended_grapheme_custer_property_boundary::__property;
|
||||
// State machine to implement the Extended Grapheme Cluster Boundary
|
||||
//
|
||||
// The exact rules may change between Unicode versions.
|
||||
// This implements the extended rules see
|
||||
// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
||||
class __extended_grapheme_cluster_break {
|
||||
using __EGC_property = __extended_grapheme_custer_property_boundary::__property;
|
||||
using __inCB_property = __indic_conjunct_break::__property;
|
||||
|
||||
__has_extened_pictographic |= __prev == __property::__Extended_Pictographic;
|
||||
|
||||
// https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
|
||||
|
||||
// *** Break at the start and end of text, unless the text is empty. ***
|
||||
|
||||
_LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
|
||||
_LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2
|
||||
|
||||
// *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
|
||||
if (__prev == __property::__CR && __next == __property::__LF) // GB3
|
||||
return false;
|
||||
|
||||
if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4
|
||||
return true;
|
||||
|
||||
if (__next == __property::__Control || __next == __property::__CR || __next == __property::__LF) // GB5
|
||||
return true;
|
||||
|
||||
// *** Do not break Hangul syllable sequences. ***
|
||||
if (__prev == __property::__L &&
|
||||
(__next == __property::__L || __next == __property::__V || __next == __property::__LV ||
|
||||
__next == __property::__LVT)) // GB6
|
||||
return false;
|
||||
|
||||
if ((__prev == __property::__LV || __prev == __property::__V) &&
|
||||
(__next == __property::__V || __next == __property::__T)) // GB7
|
||||
return false;
|
||||
|
||||
if ((__prev == __property::__LVT || __prev == __property::__T) && __next == __property::__T) // GB8
|
||||
return false;
|
||||
|
||||
// *** Do not break before extending characters or ZWJ. ***
|
||||
if (__next == __property::__Extend || __next == __property::__ZWJ)
|
||||
return false; // GB9
|
||||
|
||||
// *** Do not break before SpacingMarks, or after Prepend characters. ***
|
||||
if (__next == __property::__SpacingMark) // GB9a
|
||||
return false;
|
||||
|
||||
if (__prev == __property::__Prepend) // GB9b
|
||||
return false;
|
||||
|
||||
// *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
|
||||
|
||||
// GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
|
||||
//
|
||||
// Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ)
|
||||
// - \p{Extended_Pictographic} x Extend
|
||||
// - Extend x Extend
|
||||
// - \p{Extended_Pictographic} x ZWJ
|
||||
// - Extend x ZWJ
|
||||
//
|
||||
// So the only case left to test is
|
||||
// - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic}
|
||||
// where \p{Extended_Pictographic}' is stored in __has_extened_pictographic
|
||||
if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic)
|
||||
return false;
|
||||
|
||||
// *** Do not break within emoji flag sequences ***
|
||||
|
||||
// That is, do not break between regional indicator (RI) symbols if there
|
||||
// is an odd number of RI characters before the break point.
|
||||
|
||||
if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13
|
||||
__ri_break_allowed = !__ri_break_allowed;
|
||||
return __ri_break_allowed;
|
||||
public:
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_break(char32_t __first_code_point)
|
||||
: __prev_code_point_(__first_code_point),
|
||||
__prev_property_(__extended_grapheme_custer_property_boundary::__get_property(__first_code_point)) {
|
||||
// Initializes the active rule.
|
||||
if (__prev_property_ == __EGC_property::__Extended_Pictographic)
|
||||
__active_rule_ = __rule::__GB11_emoji;
|
||||
else if (__prev_property_ == __EGC_property::__Regional_Indicator)
|
||||
__active_rule_ = __rule::__GB12_GB13_regional_indicator;
|
||||
else if (__indic_conjunct_break::__get_property(__first_code_point) == __inCB_property::__Consonant)
|
||||
__active_rule_ = __rule::__GB9c_indic_conjunct_break;
|
||||
}
|
||||
|
||||
// *** Otherwise, break everywhere. ***
|
||||
return true; // GB999
|
||||
}
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool operator()(char32_t __next_code_point) {
|
||||
__EGC_property __next_property = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point);
|
||||
bool __result = __evaluate(__next_code_point, __next_property);
|
||||
__prev_code_point_ = __next_code_point;
|
||||
__prev_property_ = __next_property;
|
||||
return __result;
|
||||
}
|
||||
|
||||
// The code point whose break propery are considered during the next
|
||||
// evaluation cyle.
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr char32_t __current_code_point() const { return __prev_code_point_; }
|
||||
|
||||
private:
|
||||
// The naming of the identifiers matches the Unicode standard.
|
||||
// NOLINTBEGIN(readability-identifier-naming)
|
||||
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
|
||||
__evaluate(char32_t __next_code_point, __EGC_property __next_property) {
|
||||
switch (__active_rule_) {
|
||||
case __rule::__none:
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
case __rule::__GB9c_indic_conjunct_break:
|
||||
return __evaluate_GB9c_indic_conjunct_break(__next_code_point, __next_property);
|
||||
case __rule::__GB11_emoji:
|
||||
return __evaluate_GB11_emoji(__next_code_point, __next_property);
|
||||
case __rule::__GB12_GB13_regional_indicator:
|
||||
return __evaluate_GB12_GB13_regional_indicator(__next_code_point, __next_property);
|
||||
}
|
||||
__libcpp_unreachable();
|
||||
}
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr bool __evaluate_none(char32_t __next_code_point, __EGC_property __next_property) {
|
||||
// *** Break at the start and end of text, unless the text is empty. ***
|
||||
|
||||
_LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__sot, "should be handled in the constructor"); // GB1
|
||||
_LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__eot, "should be handled by our caller"); // GB2
|
||||
|
||||
// *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
|
||||
if (__prev_property_ == __EGC_property::__CR && __next_property == __EGC_property::__LF) // GB3
|
||||
return false;
|
||||
|
||||
if (__prev_property_ == __EGC_property::__Control || __prev_property_ == __EGC_property::__CR ||
|
||||
__prev_property_ == __EGC_property::__LF) // GB4
|
||||
return true;
|
||||
|
||||
if (__next_property == __EGC_property::__Control || __next_property == __EGC_property::__CR ||
|
||||
__next_property == __EGC_property::__LF) // GB5
|
||||
return true;
|
||||
|
||||
// *** Do not break Hangul syllable sequences. ***
|
||||
if (__prev_property_ == __EGC_property::__L &&
|
||||
(__next_property == __EGC_property::__L || __next_property == __EGC_property::__V ||
|
||||
__next_property == __EGC_property::__LV || __next_property == __EGC_property::__LVT)) // GB6
|
||||
return false;
|
||||
|
||||
if ((__prev_property_ == __EGC_property::__LV || __prev_property_ == __EGC_property::__V) &&
|
||||
(__next_property == __EGC_property::__V || __next_property == __EGC_property::__T)) // GB7
|
||||
return false;
|
||||
|
||||
if ((__prev_property_ == __EGC_property::__LVT || __prev_property_ == __EGC_property::__T) &&
|
||||
__next_property == __EGC_property::__T) // GB8
|
||||
return false;
|
||||
|
||||
// *** Do not break before extending characters or ZWJ. ***
|
||||
if (__next_property == __EGC_property::__Extend || __next_property == __EGC_property::__ZWJ)
|
||||
return false; // GB9
|
||||
|
||||
// *** Do not break before SpacingMarks, or after Prepend characters. ***
|
||||
if (__next_property == __EGC_property::__SpacingMark) // GB9a
|
||||
return false;
|
||||
|
||||
if (__prev_property_ == __EGC_property::__Prepend) // GB9b
|
||||
return false;
|
||||
|
||||
// *** Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker. ***
|
||||
if (__indic_conjunct_break::__get_property(__next_code_point) == __inCB_property::__Consonant) {
|
||||
__active_rule_ = __rule::__GB9c_indic_conjunct_break;
|
||||
__GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
|
||||
return true;
|
||||
}
|
||||
|
||||
// *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
|
||||
if (__next_property == __EGC_property::__Extended_Pictographic) {
|
||||
__active_rule_ = __rule::__GB11_emoji;
|
||||
__GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
|
||||
return true;
|
||||
}
|
||||
|
||||
// *** Do not break within emoji flag sequences ***
|
||||
|
||||
// That is, do not break between regional indicator (RI) symbols if there
|
||||
// is an odd number of RI characters before the break point.
|
||||
if (__next_property == __EGC_property::__Regional_Indicator) { // GB12 + GB13
|
||||
__active_rule_ = __rule::__GB12_GB13_regional_indicator;
|
||||
return true;
|
||||
}
|
||||
|
||||
// *** Otherwise, break everywhere. ***
|
||||
return true; // GB999
|
||||
}
|
||||
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
|
||||
__evaluate_GB9c_indic_conjunct_break(char32_t __next_code_point, __EGC_property __next_property) {
|
||||
__inCB_property __break = __indic_conjunct_break::__get_property(__next_code_point);
|
||||
if (__break == __inCB_property::__none) {
|
||||
__active_rule_ = __rule::__none;
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
}
|
||||
|
||||
switch (__GB9c_indic_conjunct_break_state_) {
|
||||
case __GB9c_indic_conjunct_break_state::__Consonant:
|
||||
if (__break == __inCB_property::__Extend) {
|
||||
return false;
|
||||
}
|
||||
if (__break == __inCB_property::__Linker) {
|
||||
__GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Linker;
|
||||
return false;
|
||||
}
|
||||
__active_rule_ = __rule::__none;
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
|
||||
case __GB9c_indic_conjunct_break_state::__Linker:
|
||||
if (__break == __inCB_property::__Extend) {
|
||||
return false;
|
||||
}
|
||||
if (__break == __inCB_property::__Linker) {
|
||||
return false;
|
||||
}
|
||||
if (__break == __inCB_property::__Consonant) {
|
||||
__GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
|
||||
return false;
|
||||
}
|
||||
__active_rule_ = __rule::__none;
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
}
|
||||
__libcpp_unreachable();
|
||||
}
|
||||
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
|
||||
__evaluate_GB11_emoji(char32_t __next_code_point, __EGC_property __next_property) {
|
||||
switch (__GB11_emoji_state_) {
|
||||
case __GB11_emoji_state::__Extended_Pictographic:
|
||||
if (__next_property == __EGC_property::__Extend) {
|
||||
__GB11_emoji_state_ = __GB11_emoji_state::__Extend;
|
||||
return false;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case __GB11_emoji_state::__Extend:
|
||||
if (__next_property == __EGC_property::__ZWJ) {
|
||||
__GB11_emoji_state_ = __GB11_emoji_state::__ZWJ;
|
||||
return false;
|
||||
}
|
||||
if (__next_property == __EGC_property::__Extend)
|
||||
return false;
|
||||
__active_rule_ = __rule::__none;
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
|
||||
case __GB11_emoji_state::__ZWJ:
|
||||
if (__next_property == __EGC_property::__Extended_Pictographic) {
|
||||
__GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
|
||||
return false;
|
||||
}
|
||||
__active_rule_ = __rule::__none;
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
}
|
||||
__libcpp_unreachable();
|
||||
}
|
||||
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
|
||||
__evaluate_GB12_GB13_regional_indicator(char32_t __next_code_point, __EGC_property __next_property) {
|
||||
__active_rule_ = __rule::__none;
|
||||
if (__next_property == __EGC_property::__Regional_Indicator)
|
||||
return false;
|
||||
return __evaluate_none(__next_code_point, __next_property);
|
||||
}
|
||||
|
||||
char32_t __prev_code_point_;
|
||||
__EGC_property __prev_property_;
|
||||
|
||||
enum class __rule {
|
||||
__none,
|
||||
__GB9c_indic_conjunct_break,
|
||||
__GB11_emoji,
|
||||
__GB12_GB13_regional_indicator,
|
||||
};
|
||||
__rule __active_rule_ = __rule::__none;
|
||||
|
||||
enum class __GB11_emoji_state {
|
||||
__Extended_Pictographic,
|
||||
__Extend,
|
||||
__ZWJ,
|
||||
};
|
||||
__GB11_emoji_state __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
|
||||
|
||||
enum class __GB9c_indic_conjunct_break_state {
|
||||
__Consonant,
|
||||
__Linker,
|
||||
};
|
||||
|
||||
__GB9c_indic_conjunct_break_state __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
|
||||
|
||||
// NOLINTEND(readability-identifier-naming)
|
||||
};
|
||||
|
||||
/// Helper class to extract an extended grapheme cluster from a Unicode character range.
|
||||
///
|
||||
|
@ -385,9 +531,7 @@ class __extended_grapheme_cluster_view {
|
|||
|
||||
public:
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
|
||||
: __code_point_view_(__first, __last),
|
||||
__next_code_point_(__code_point_view_.__consume().__code_point),
|
||||
__next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
|
||||
: __code_point_view_(__first, __last), __at_break_(__code_point_view_.__consume().__code_point) {}
|
||||
|
||||
struct __cluster {
|
||||
/// The first code point of the extended grapheme cluster.
|
||||
|
@ -403,45 +547,20 @@ public:
|
|||
_Iterator __last_;
|
||||
};
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
|
||||
_LIBCPP_ASSERT(
|
||||
__next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
|
||||
"can't move beyond the end of input");
|
||||
|
||||
char32_t __code_point = __next_code_point_;
|
||||
if (!__code_point_view_.__at_end())
|
||||
return {__code_point, __get_break()};
|
||||
|
||||
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
|
||||
return {__code_point, __code_point_view_.__position()};
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
|
||||
char32_t __code_point = __at_break_.__current_code_point();
|
||||
_Iterator __position = __code_point_view_.__position();
|
||||
while (!__code_point_view_.__at_end()) {
|
||||
if (__at_break_(__code_point_view_.__consume().__code_point))
|
||||
break;
|
||||
__position = __code_point_view_.__position();
|
||||
}
|
||||
return {__code_point, __position};
|
||||
}
|
||||
|
||||
private:
|
||||
__code_point_view<_CharT> __code_point_view_;
|
||||
|
||||
char32_t __next_code_point_;
|
||||
__extended_grapheme_custer_property_boundary::__property __next_prop_;
|
||||
|
||||
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __get_break() {
|
||||
bool __ri_break_allowed = true;
|
||||
bool __has_extened_pictographic = false;
|
||||
while (true) {
|
||||
_Iterator __result = __code_point_view_.__position();
|
||||
__extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
|
||||
if (__code_point_view_.__at_end()) {
|
||||
__next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
|
||||
return __result;
|
||||
}
|
||||
__next_code_point_ = __code_point_view_.__consume().__code_point;
|
||||
__next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
|
||||
|
||||
__has_extened_pictographic |=
|
||||
__prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
|
||||
|
||||
if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
|
||||
return __result;
|
||||
}
|
||||
}
|
||||
__extended_grapheme_cluster_break __at_break_;
|
||||
};
|
||||
|
||||
template <contiguous_iterator _Iterator>
|
||||
|
@ -463,7 +582,7 @@ public:
|
|||
_LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
|
||||
|
||||
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
|
||||
_LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
|
||||
_LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
|
||||
return {static_cast<char32_t>(*__first_++)};
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue