Files
KirkOS/user/include/mlibc/options/internal/generic/charcode.cpp
T
kaguya 9a9b91c940 user: implement mlibc as the libc, finally.
It's finally done..

Signed-off-by: kaguya <vpshinomiya@protonmail.com>
2026-05-02 03:31:49 -04:00

269 lines
7.3 KiB
C++

#include <bits/ensure.h>
#include <frg/string.hpp>
#include <mlibc/charcode.hpp>
#include <mlibc/debug.hpp>
namespace mlibc {
// UTF-8 charcode: provides an incremental decoder (bytes -> code points) and
// an encoder (code points -> bytes) for use by polymorphic_charcode_adapter.
struct utf8_charcode {
    // Bytes below 0x80 are decoded to / encoded from the identical code point.
    static constexpr bool preserves_7bit_units = true;
    static constexpr bool has_shift_states = false;

    // Incremental decoder that consumes exactly one code unit (byte) per call.
    struct decode_state {
        decode_state()
        : _progress{0}, _cpoint{0} { }

        // Number of continuation units that are still expected.
        // Zero means that cpoint() holds a completely decoded code point.
        auto progress() const { return _progress; }
        // Code point accumulated so far.
        auto cpoint() const { return _cpoint; }

        // Consumes the unit at seq.it and advances seq.it on success.
        // Returns charcode_error::null if the unit was accepted and
        // charcode_error::illegal_input (leaving seq.it untouched) if the unit
        // cannot appear at this position of a UTF-8 sequence.
        // NOTE: overlong forms, surrogates and values above 0x10FFFF are not rejected here.
        charcode_error operator() (code_seq<const char> &seq) {
            auto uc = static_cast<unsigned char>(*seq.it);
            if(!_progress) {
                if(!(uc & 0b1000'0000)) {
                    // ASCII-compatible.
                    _cpoint = uc;
                }else if((uc & 0b1110'0000) == 0b1100'0000) {
                    // Lead unit of a two-unit sequence.
                    _cpoint = uc & 0b1'1111;
                    _progress = 1;
                }else if((uc & 0b1111'0000) == 0b1110'0000) {
                    // Lead unit of a three-unit sequence.
                    _cpoint = uc & 0b1111;
                    _progress = 2;
                }else if((uc & 0b1111'1000) == 0b1111'0000) {
                    // Lead unit of a four-unit sequence.
                    _cpoint = uc & 0b111;
                    _progress = 3;
                }else{
                    // If the highest two bits are 0b10, this is the second (or later) unit.
                    // Units with highest five bits = 0b11111 do not occur in valid UTF-8.
                    __ensure((uc & 0b1100'0000) == 0b1000'0000
                            || (uc & 0b1111'1000) == 0b1111'1000);
                    return charcode_error::illegal_input;
                }
            }else{
                // Inside a multi-unit sequence only continuation units (0b10xx'xxxx) are valid.
                // Anything else is malformed input (not an internal invariant violation),
                // so report it to the caller instead of aborting.
                if((uc & 0b1100'0000) != 0b1000'0000)
                    return charcode_error::illegal_input;
                _cpoint = (_cpoint << 6) | (uc & 0x3F);
                --_progress;
            }
            ++seq.it;
            return charcode_error::null;
        }

    private:
        int _progress;
        codepoint _cpoint;
    };

// Stores one code unit into nseq, or bails out of the enclosing function with
// output_overflow if nseq has no room left.
#define NSEQ_STORE(VAL) do { \
        if (!static_cast<bool>(nseq)) { \
            return charcode_error::output_overflow; \
        } \
        *nseq.it = (VAL); \
        ++nseq.it; \
    } while (0)

    struct encode_state {
        // Encodes a single character from wseq + the current state and stores it in nseq.
        // On success, wseq.it is advanced by one code point and nseq.it by the number of
        // units written. Returns output_overflow if nseq runs out of space (units stored
        // before the overflow stay written and nseq.it stays advanced) and illegal_input
        // for code points above 0x10FFFF.
        // TODO: Convert decode_state to the same strategy.
        charcode_error operator() (code_seq<char> &nseq, code_seq<const codepoint> &wseq) {
            auto wc = *wseq.it;
            if (wc <= 0x7F) {
                NSEQ_STORE(wc);
            } else if (wc <= 0x7FF) {
                NSEQ_STORE(0xC0 | (wc >> 6));
                NSEQ_STORE(0x80 | (wc & 0x3f));
            } else if (wc <= 0xFFFF) {
                NSEQ_STORE(0xE0 | (wc >> 12));
                NSEQ_STORE(0x80 | ((wc >> 6) & 0x3f));
                NSEQ_STORE(0x80 | (wc & 0x3f));
            } else if (wc <= 0x10FFFF) {
                NSEQ_STORE(0xF0 | (wc >> 18));
                NSEQ_STORE(0x80 | ((wc >> 12) & 0x3f));
                NSEQ_STORE(0x80 | ((wc >> 6) & 0x3f));
                NSEQ_STORE(0x80 | (wc & 0x3f));
            } else {
                return charcode_error::illegal_input;
            }
            ++wseq.it;
            return charcode_error::null;
        }
    };
#undef NSEQ_STORE
};
// Out-of-line definition of polymorphic_charcode's destructor (compiler-generated body).
polymorphic_charcode::~polymorphic_charcode() = default;
// For *decoding, this class assumes that:
// - G::decode_state has members progress() and cpoint().
// - G::decode_state::progress() >= 0 at all times.
// TODO: This will be needed on platforms like Windows, where wchar_t is UTF-16.
// TODO: There, we can use negative __mlibc_mbstate::progress to represent encoding to UTF-16.
// - If G::decode_state::progress() == 0, the code point (given by cpoint())
// was decoded successfully.
// Implements the polymorphic_charcode interface on top of a charcode G that
// provides G::decode_state and G::encode_state.
template<typename G>
struct polymorphic_charcode_adapter : polymorphic_charcode {
    polymorphic_charcode_adapter()
    : polymorphic_charcode{G::preserves_7bit_units, G::has_shift_states} { }

    // Decodes code units from nseq into code points stored in wseq.
    // Stops when either sequence is exhausted, when a null code point is decoded
    // (it is consumed from nseq but not stored) or when the decoder reports an error.
    // nseq.it only ever moves past completely decoded code points.
    // Returns input_underflow if nseq ends in the middle of a code point.
    charcode_error decode(code_seq<const char> &nseq, code_seq<codepoint> &wseq,
            __mlibc_mbstate &st) override {
        __ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().

        code_seq<const char> decode_nseq = nseq;
        typename G::decode_state ds;

        while(decode_nseq && wseq) {
            // Consume the next code unit.
            if(auto e = ds(decode_nseq); e != charcode_error::null)
                return e;

            // Produce a new code point.
            if(!ds.progress()) {
                // "Commit" consumed code units (as there was no decode error).
                nseq.it = decode_nseq.it;
                if(!ds.cpoint()) // Stop on null characters.
                    return charcode_error::null;
                *wseq.it = ds.cpoint();
                ++wseq.it;
            }
        }

        if(ds.progress())
            return charcode_error::input_underflow;
        return charcode_error::null;
    }

    // Same as decode(), but stores the decoded code points as wchar_t.
    charcode_error decode_wtranscode(code_seq<const char> &nseq, code_seq<wchar_t> &wseq,
            __mlibc_mbstate &st) override {
        __ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().

        code_seq<const char> decode_nseq = nseq;
        typename G::decode_state ds;

        while(decode_nseq && wseq) {
            // Consume the next code unit.
            if(auto e = ds(decode_nseq); e != charcode_error::null)
                return e;

            // Produce a new code point.
            if(!ds.progress()) {
                // "Commit" consumed code units (as there was no decode error).
                nseq.it = decode_nseq.it;
                if(!ds.cpoint()) // Stop on null characters.
                    return charcode_error::null;
                *wseq.it = ds.cpoint();
                ++wseq.it;
            }
        }

        if(ds.progress())
            return charcode_error::input_underflow;
        return charcode_error::null;
    }

    // Like decode_wtranscode(), but nothing is stored: *n receives the number of
    // non-null code points that were decoded from nseq.
    charcode_error decode_wtranscode_length(code_seq<const char> &nseq, size_t *n,
            __mlibc_mbstate &st) override {
        __ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().

        code_seq<const char> decode_nseq = nseq;
        typename G::decode_state ds;

        *n = 0;
        while(decode_nseq) {
            // Consume the next code unit.
            if(auto e = ds(decode_nseq); e != charcode_error::null)
                return e;

            if(!ds.progress()) {
                // "Commit" consumed code units (as there was no decode error).
                nseq.it = decode_nseq.it;
                if(!ds.cpoint()) // Stop on null code points.
                    return charcode_error::null;
                ++(*n);
            }
        }

        if(ds.progress())
            return charcode_error::input_underflow;
        return charcode_error::null;
    }

    // Encodes wide characters from wseq into code units stored in nseq.
    // Stops when either sequence is exhausted, at a null wide character (which is
    // neither consumed nor encoded) or when the encoder reports an error.
    // nseq.it and wseq.it only ever move past completely encoded characters.
    charcode_error encode_wtranscode(code_seq<char> &nseq, code_seq<const wchar_t> &wseq,
            __mlibc_mbstate &st) override {
        __ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().

        code_seq<char> encode_nseq = nseq;
        typename G::encode_state es;

        while(encode_nseq && wseq) {
            codepoint cp = *wseq.it;
            if(!cp)
                return charcode_error::null;

            code_seq<const codepoint> cps{&cp, &cp + 1};
            if(auto e = es(encode_nseq, cps); e == charcode_error::dirty) {
                continue;
            }else if(e != charcode_error::null) {
                return e;
            }
            __ensure(cps.it == cps.end);
            ++wseq.it;

            // "Commit" produced code units (as there was no encode error).
            nseq.it = encode_nseq.it;
        }

        if(encode_nseq.it != nseq.it)
            return charcode_error::output_overflow;
        return charcode_error::null;
    }

    // Like encode_wtranscode(), but nothing is stored: *n receives the number of
    // narrow code units (bytes) that encoding wseq up to (excluding) the first null
    // wide character would produce.
    charcode_error encode_wtranscode_length(code_seq<const wchar_t> &wseq, size_t *n,
            __mlibc_mbstate &st) override {
        __ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().

        typename G::encode_state es;

        *n = 0;
        // Units emitted by steps that reported `dirty`. They are only added to *n once
        // the pending character is fully encoded, mirroring how encode_wtranscode()
        // commits its output. (Presumably `dirty` means "call again"; the UTF-8
        // encoder in this file never returns it.)
        size_t pending = 0;
        while(wseq) {
            // Scratch space for the units of a single encoder step.
            char temp[4];
            code_seq<char> encode_nseq{temp, temp + 4};

            codepoint cp = *wseq.it;
            if(!cp)
                return charcode_error::null;

            // Encode the next wide character into the scratch buffer.
            code_seq<const codepoint> cps{&cp, &cp + 1};
            auto e = es(encode_nseq, cps);
            if(e != charcode_error::null && e != charcode_error::dirty)
                return e;

            // Count the units that were actually written, not the number of characters.
            pending += static_cast<size_t>(encode_nseq.it - temp);
            if(e == charcode_error::dirty)
                continue;

            *n += pending;
            pending = 0;
            ++wseq.it;
        }

        return charcode_error::null;
    }
};
// Returns the charcode used for multibyte strings: a single function-local
// static adapter around utf8_charcode.
polymorphic_charcode *current_charcode() {
    static polymorphic_charcode_adapter<utf8_charcode> utf8_adapter;
    polymorphic_charcode *active = &utf8_adapter;
    return active;
}
// Converts the wide character nc into the code point wc.
// The conversion is currently the identity and always succeeds.
charcode_error wide_charcode::promote(wchar_t nc, codepoint &wc) {
    // TODO: Allow non-identity encodings of wchar_t.
    wc = static_cast<codepoint>(nc);
    return charcode_error::null;
}
// Returns the wide_charcode of the platform: a single function-local static instance.
wide_charcode *platform_wide_charcode() {
    static wide_charcode wide_instance;
    wide_charcode *active = &wide_instance;
    return active;
}
} // namespace mlibc