// KirkOS/user/include/mlibc/options/internal/generic/charcode.cpp


#include <bits/ensure.h>
#include <frg/string.hpp>
#include <mlibc/charcode.hpp>
#include <mlibc/debug.hpp>

namespace mlibc {

struct utf8_charcode {
	static constexpr bool preserves_7bit_units = true;
	static constexpr bool has_shift_states = false;

	struct decode_state {
		decode_state()
		: _progress{0}, _cpoint{0} { }

		auto progress() { return _progress; }
		auto cpoint() { return _cpoint; }

		charcode_error operator() (code_seq<const char> &seq) {
			auto uc = static_cast<unsigned char>(*seq.it);
			if(!_progress) {
				if(!(uc & 0b1000'0000)) {
					// ASCII-compatible.
					_cpoint = uc;
				}else if((uc & 0b1110'0000) == 0b1100'0000) {
					_cpoint = uc & 0b1'1111;
					_progress = 1;
				}else if((uc & 0b1111'0000) == 0b1110'0000) {
					_cpoint = uc & 0b1111;
					_progress = 2;
				}else if((uc & 0b1111'1000) == 0b1111'0000) {
					_cpoint = uc & 0b111;
					_progress = 3;
				}else{
					// If the highest two bits are 0b10, this is the second (or later) unit.
					// Units with highest five bits = 0b11111 do not occur in valid UTF-8.
					__ensure((uc & 0b1100'0000) == 0b1000'0000
							|| (uc & 0b1111'1000) == 0b1111'1000);
					return charcode_error::illegal_input;
				}
			}else{
				// TODO: Return an error.
				__ensure((uc & 0b1100'0000) == 0b1000'0000);
				_cpoint = (_cpoint << 6) | (uc & 0x3F);
				--_progress;
			}
			++seq.it;
			return charcode_error::null;
		}

	private:
		int _progress;
		codepoint _cpoint;
	};

#define NSEQ_STORE(VAL) do { \
	if (!static_cast<bool>(nseq)) { \
		return charcode_error::output_overflow; \
	} \
	*nseq.it = (VAL); \
	++nseq.it; \
} while (0)

	struct encode_state {
		// Encodes a single character from wseq + the current state and stores it in nseq.
		// TODO: Convert decode_state to the same strategy.
		charcode_error operator() (code_seq<char> &nseq, code_seq<const codepoint> &wseq) {
			auto wc = *wseq.it;
			if (wc <= 0x7F) {
				NSEQ_STORE(wc);
			} else if (wc <= 0x7FF) {
				NSEQ_STORE(0xC0 | (wc >> 6));
				NSEQ_STORE(0x80 | (wc & 0x3f));
			} else if (wc <= 0xFFFF) {
				NSEQ_STORE(0xE0 | (wc >> 12));
				NSEQ_STORE(0x80 | ((wc >> 6) & 0x3f));
				NSEQ_STORE(0x80 | (wc & 0x3f));
			} else if (wc <= 0x10FFFF) {
				NSEQ_STORE(0xF0 | (wc >> 18));
				NSEQ_STORE(0x80 | ((wc >> 12) & 0x3f));
				NSEQ_STORE(0x80 | ((wc >> 6) & 0x3f));
				NSEQ_STORE(0x80 | (wc & 0x3f));
			} else {
				return charcode_error::illegal_input;
			}
			++wseq.it;
			return charcode_error::null;
		}
	};

#undef NSEQ_STORE
};

// Out-of-class, defaulted definition of the polymorphic_charcode destructor.
// NOTE(review): the declaration presumably lives in <mlibc/charcode.hpp> — confirm.
polymorphic_charcode::~polymorphic_charcode() = default;

// Adapts a charcode description G (such as utf8_charcode) to the virtual
// polymorphic_charcode interface.
//
// For *decoding, this class assumes that:
// - G::decode_state has members progress() and cpoint().
// - G::decode_state::progress() >= 0 at all times.
//   TODO: This will be needed on platforms like Windows, where wchar_t is UTF-16.
//   TODO: There, we can use negative __mlibc_mbstate::progress to represent encoding to UTF-16.
// - If G::decode_state::progress() == 0, the code point (given by cpoint())
//   was decoded successfully.
//
// All operations currently require an initial conversion state (st.__progress == 0)
// and never write back to st.
template<typename G>
struct polymorphic_charcode_adapter : polymorphic_charcode {
	polymorphic_charcode_adapter()
	: polymorphic_charcode{G::preserves_7bit_units, G::has_shift_states} { }

	// Decodes code units from nseq into code points stored in wseq.
	// nseq.it is only advanced past completely decoded characters; wseq.it is advanced
	// past every stored code point. Decoding stops after consuming a null character
	// (which is not stored), or when nseq or wseq is exhausted.
	// Returns input_underflow if nseq ends inside a multi-unit sequence, or the decoder's
	// error for malformed input.
	charcode_error decode(code_seq<const char> &nseq, code_seq<codepoint> &wseq,
			__mlibc_mbstate &st) override {
		__ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
		return decode_sequence(nseq, wseq);
	}

	// Same as decode(), but stores the decoded code points as wchar_t values.
	// Each code point is assigned to a wchar_t unchanged.
	charcode_error decode_wtranscode(code_seq<const char> &nseq, code_seq<wchar_t> &wseq,
			__mlibc_mbstate &st) override {
		__ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().
		return decode_sequence(nseq, wseq);
	}

	// Counts in *n how many characters decode_wtranscode() would produce for nseq,
	// without storing them. The terminating null character is consumed but not counted.
	// nseq.it is advanced past every completely decoded character.
	charcode_error decode_wtranscode_length(code_seq<const char> &nseq, size_t *n,
			__mlibc_mbstate &st) override {
		__ensure(!st.__progress); // TODO: Update st with ds.progress() and ds.cpoint().

		code_seq<const char> decode_nseq = nseq;
		typename G::decode_state ds;

		*n = 0;
		while(decode_nseq) {
			// Consume the next code unit.
			if(auto e = ds(decode_nseq); e != charcode_error::null)
				return e;

			if(!ds.progress()) {
				// "Commit" consumed code units (as there was no decode error).
				nseq.it = decode_nseq.it;
				if(!ds.cpoint()) // Stop on null code points.
					return charcode_error::null;
				++(*n);
			}
		}

		if(ds.progress())
			return charcode_error::input_underflow;
		return charcode_error::null;
	}

	// Encodes wide characters from wseq into code units stored in nseq.
	// Stops at a null wide character (which is neither consumed nor encoded) or when
	// nseq or wseq is exhausted. nseq.it only moves past the units of characters that
	// were encoded completely; if a character does not fit, the encoder's error is
	// returned and nseq.it stays at the last committed position.
	charcode_error encode_wtranscode(code_seq<char> &nseq, code_seq<const wchar_t> &wseq,
			__mlibc_mbstate &st) override {
		__ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().

		code_seq<char> encode_nseq = nseq;
		typename G::encode_state es;

		while(encode_nseq && wseq) {
			codepoint cp = *wseq.it;
			if(!cp)
				return charcode_error::null;

			code_seq<const codepoint> cps{&cp, &cp + 1};
			if(auto e = es(encode_nseq, cps); e == charcode_error::dirty) {
				// The encoder asks to be invoked again for the same character.
				continue;
			}else if(e != charcode_error::null) {
				return e;
			}
			__ensure(cps.it == cps.end);
			++wseq.it;

			// "Commit" produced code units (as there was no encode error).
			nseq.it = encode_nseq.it;
		}

		// Uncommitted units remain only if the loop ended right after a "dirty" pass.
		if(encode_nseq.it != nseq.it)
			return charcode_error::output_overflow;
		return charcode_error::null;
	}

	// Computes in *n the number of code units (bytes) that encode_wtranscode() would
	// produce for wseq, without storing them. Stops at a null wide character, which is
	// neither consumed nor counted.
	// NOTE(review): callers such as wcsrtombs() with a null destination presumably rely
	// on *n being a byte count rather than a character count — confirm against callers.
	charcode_error encode_wtranscode_length(code_seq<const wchar_t> &wseq, size_t *n,
			__mlibc_mbstate &st) override {
		__ensure(!st.__progress); // TODO: Update st with es.progress() and es.cpoint().

		typename G::encode_state es;

		*n = 0;
		while(wseq) {
			codepoint cp = *wseq.it;
			if(!cp)
				return charcode_error::null;

			// Encode the character into a scratch buffer just to learn its length.
			// NOTE(review): four units suffice for utf8_charcode; other charcodes may
			// need a larger scratch buffer.
			char temp[4];
			code_seq<char> encode_nseq{temp, temp + 4};
			code_seq<const codepoint> cps{&cp, &cp + 1};
			auto e = es(encode_nseq, cps);
			if(e != charcode_error::null && e != charcode_error::dirty)
				return e;

			// Count the units that were actually produced, not the number of characters.
			// Units emitted during a "dirty" pass are counted as well, mirroring
			// encode_wtranscode(), which keeps them in its output.
			*n += static_cast<size_t>(encode_nseq.it - temp);
			if(e == charcode_error::dirty)
				continue;
			++wseq.it;
		}

		return charcode_error::null;
	}

private:
	// Shared implementation of decode() and decode_wtranscode(); W is the type of the
	// stored output units (codepoint or wchar_t).
	template<typename W>
	static charcode_error decode_sequence(code_seq<const char> &nseq, code_seq<W> &wseq) {
		code_seq<const char> decode_nseq = nseq;
		typename G::decode_state ds;

		while(decode_nseq && wseq) {
			// Consume the next code unit.
			if(auto e = ds(decode_nseq); e != charcode_error::null)
				return e;

			// Produce a new code point.
			if(!ds.progress()) {
				// "Commit" consumed code units (as there was no decode error).
				nseq.it = decode_nseq.it;
				if(!ds.cpoint()) // Stop on null characters.
					return charcode_error::null;
				*wseq.it = ds.cpoint();
				++wseq.it;
			}
		}

		if(ds.progress())
			return charcode_error::input_underflow;
		return charcode_error::null;
	}
};

// Returns the charcode used for multi-byte conversions.
// This is always the same function-local UTF-8 adapter instance.
polymorphic_charcode *current_charcode() {
	static polymorphic_charcode_adapter<utf8_charcode> utf8_adapter;
	polymorphic_charcode *active = &utf8_adapter;
	return active;
}

// Converts the wide character nc into the code point wc.
// The value is copied over unchanged, so this always reports charcode_error::null.
charcode_error wide_charcode::promote(wchar_t nc, codepoint &wc) {
	// TODO: Allow non-identity encodings of wchar_t.
	wc = static_cast<codepoint>(nc);
	return charcode_error::null;
}

// Returns the wide_charcode object for this platform.
// This is always the same function-local instance.
wide_charcode *platform_wide_charcode() {
	static wide_charcode wide_instance;
	wide_charcode *result = &wide_instance;
	return result;
}

} // namespace mlibc