Add Bytes as the new string type

This replaces IrcStr.
This commit is contained in:
TheDaemoness
2023-04-03 12:27:37 -07:00
parent a5d4f9f969
commit 69bb0c85c8
3 changed files with 374 additions and 8 deletions

View File

@@ -11,7 +11,7 @@
//pub mod msg;
//pub mod sasl;
//pub mod state;
//mod string;
pub mod string;
//pub use crate::string::strmap;
//pub use crate::string::*;

View File

@@ -1,8 +1,57 @@
#[cfg(feature = "base64")]
pub mod base64;
mod ircstr;
mod ircword;
pub mod strmap;
//! Byte strings and string manipulation utilities.
//!
//! The core primitive of vinezombie is an immutable byte string type
//! that can either borrow or have shared ownership of its contents.
pub use ircstr::IrcStr;
pub use ircword::IrcWord;
//#[cfg(feature = "base64")]
//pub mod base64;
mod bytes;
//pub mod strmap;
pub use bytes::Bytes;
//pub use ircstr::IrcStr;
//pub use ircword::IrcWord;
use std::borrow::Cow;
/// Types that represent byte string transformations.
///
/// A transformation takes a byte slice and yields a [`Transformation`]:
/// the transformed bytes, a statement about their UTF-8 validity,
/// and an additional value of type [`Transform::Value`].
///
/// # Safety
/// `bytes` may have a forged lifetime, often `'static`.
/// Implementors MUST NOT store anything with a lifetime that is bounded by `'a`.
///
/// The `utf8` field of the returned [`Transformation`] is trusted to be correct,
/// and byte slices may be incorrectly cast unchecked to `str`s otherwise.
pub unsafe trait Transform {
/// The type of values yielded in addition to a transformed byte string.
type Value;
/// Transforms a byte string.
///
/// The returned [`Transformation`] may borrow from `bytes` for `'a`
/// or carry a newly-allocated buffer; see its `transformed` field.
fn transform<'a>(&self, bytes: &'a [u8]) -> Transformation<'a, Self::Value>;
}
/// The result of a byte string transformation, as returned by [`Transform::transform()`].
///
/// `'a` is the lifetime of the input slice, which `transformed` may borrow from.
/// `T` is the type of the additional value, i.e. [`Transform::Value`].
pub struct Transformation<'a, T> {
/// An additional value yielded by this transformation. Often `()`.
pub value: T,
/// The transformed string.
///
/// Either borrowed for `'a` or an owned buffer.
pub transformed: Cow<'a, [u8]>,
/// The UTF-8 validity of `transformed`. See [`Utf8Policy`].
pub utf8: Utf8Policy,
}
/// The UTF-8 validity of a transformed string based on the input string.
///
/// The first three variants make an absolute statement about the output
/// and carry explicit discriminants (`-1`, `0`, `1`).
/// The last two describe the output's validity relative to the input's
/// and take the implicit discriminants that follow (`2` and `3`).
#[repr(i8)]
#[derive(Clone, Copy, PartialEq, Eq, Default, Debug)]
pub enum Utf8Policy {
/// The returned slice is NOT valid UTF-8.
Invalid = -1,
/// The returned slice has unknown UTF-8 validity.
///
/// This is the default policy.
#[default]
Recheck = 0,
/// The returned slice is valid UTF-8.
Valid = 1,
/// The returned slice is valid UTF-8 if the input slice was valid UTF-8.
///
/// Nothing is implied about the output if the input was not valid UTF-8.
Preserve,
/// The returned slice is valid UTF-8 if and only if the input slice was valid UTF-8.
PreserveStrict,
}

317
src/string/bytes.rs Normal file
View File

@@ -0,0 +1,317 @@
use super::{Transform, Utf8Policy};
use std::{
borrow::Cow,
ops::Deref,
ptr::NonNull,
sync::atomic::{AtomicI8, Ordering::Relaxed},
sync::atomic::{AtomicUsize, Ordering},
};
/// A borrowing or shared-owning immutable byte string. Not to be confused with Bytes
/// from the crate of the same name.
///
/// The result of UTF-8 validity checks is cached alongside the bytes,
/// so repeated conversions to `str` do not have to re-validate them.
#[derive(Default)]
pub struct Bytes<'a> {
/// The bytes of this string.
value: &'a [u8],
/// If this is Some, `value` points to data owned by this.
/// It's very important that the slice never be returned with
/// a lifetime longer than the `Bytes` it was obtained from.
ownership: Option<OwnedBytes>,
/// The result of UTF-8 validity checks.
/// 0 if "unknown", 1 if UTF-8, -1 if NOT UTF-8.
utf8: AtomicI8,
}
impl<'a> Bytes<'a> {
    /// Creates a `'static` version of this string that owns its data.
    ///
    /// A string that already has shared ownership of its buffer is cloned and merely has
    /// its lifetime widened. A borrowing string has its bytes copied into a new buffer.
    /// The cached UTF-8 validity is carried over in both cases.
    pub fn owning(&self) -> Bytes<'static> {
        match self.ownership {
            // SAFETY: when `ownership` is `Some`, `value` points to data owned by it,
            // and the clone holds its own handle to that data,
            // so the bytes remain valid independently of `'a`.
            Some(_) => unsafe { std::mem::transmute(self.clone()) },
            None => {
                // SAFETY: the unbound slice is stored together with the handle that owns it.
                let (handle, value) = unsafe { OwnedBytes::from_vec(self.value.to_vec()) };
                Bytes { value, ownership: Some(handle), utf8: self.utf8.load(Relaxed).into() }
            }
        }
    }

    // TODO: Are the "borrowed" methods from IrcStr needed?
    // They haven't really been necessary IME,
    // and with the UTF-8 checks they result in a lot of duplication,
    // especially if one finds a need for to_borrowed_or_cloned.

    /// Returns `true` if this byte string contains no bytes.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the number of bytes in this byte string.
    pub fn len(&self) -> usize {
        self.value.len()
    }

    /// Returns `self`'s value as a `str` if it is valid UTF-8, and `None` otherwise.
    ///
    /// A validity check is run only if no result is cached yet,
    /// i.e. `self` was not constructed from a UTF-8 string and was not checked before.
    /// The outcome of that check is cached for later calls.
    pub fn to_utf8(&self) -> Option<&str> {
        let cached = self.utf8.load(Relaxed);
        if cached == 1 {
            // SAFETY: a cached `1` records that the bytes are UTF-8.
            return Some(unsafe { std::str::from_utf8_unchecked(self.value) });
        }
        if cached == -1 {
            return None;
        }
        let checked = std::str::from_utf8(self.value).ok();
        let verdict = match checked {
            Some(_) => 1i8,
            None => -1i8,
        };
        self.utf8.store(verdict, Relaxed);
        checked
    }

    /// Returns a version of `self` that is valid UTF-8,
    /// with every non-UTF-8 byte sequence replaced by the
    /// [U+FFFD replacement character](std::char::REPLACEMENT_CHARACTER).
    ///
    /// If `self` is already valid UTF-8, the result shares `self`'s bytes.
    /// If the validity of `self` was unknown, it is cached as a side effect.
    pub fn to_utf8_lossy(&self) -> Self {
        let cached = self.utf8.load(Relaxed);
        let lossy = if cached == 1 {
            // SAFETY: a cached `1` records that the bytes are UTF-8.
            Cow::Borrowed(unsafe { std::str::from_utf8_unchecked(self.value) })
        } else {
            let converted = String::from_utf8_lossy(self.value);
            if cached != -1 {
                // Validity was unknown: a borrowed result means nothing had to be replaced.
                let verdict = match &converted {
                    Cow::Borrowed(_) => 1i8,
                    Cow::Owned(_) => -1i8,
                };
                self.utf8.store(verdict, Relaxed);
            }
            converted
        };
        match lossy {
            Cow::Owned(replaced) => replaced.into(),
            Cow::Borrowed(text) => {
                Bytes { value: text.as_bytes(), ownership: self.ownership.clone(), utf8: 1i8.into() }
            }
        }
    }

    /// Replaces the contents of `self` with the output of the provided [`Transform`]
    /// and returns the additional value the transformation yielded.
    ///
    /// The cached UTF-8 validity is updated according to the [`Utf8Policy`]
    /// reported by the transformation.
    pub fn transform<T: Transform + ?Sized>(&mut self, tf: &T) -> T::Value {
        let outcome = tf.transform(self.value);
        let policy = outcome.utf8;
        match outcome.transformed {
            Cow::Owned(buffer) => {
                let flag = match policy {
                    Utf8Policy::Invalid | Utf8Policy::Recheck | Utf8Policy::Valid => policy as i8,
                    // Only a known-valid input says anything about the output.
                    Utf8Policy::Preserve => i8::from(self.utf8.load(Relaxed) == 1),
                    Utf8Policy::PreserveStrict => self.utf8.load(Relaxed),
                };
                // SAFETY: the unbound slice is stored together with the handle that owns it.
                let (handle, value) = unsafe { OwnedBytes::from_vec(buffer) };
                *self = Bytes { value, ownership: Some(handle), utf8: flag.into() };
            }
            Cow::Borrowed(slice) => {
                match policy {
                    Utf8Policy::Invalid | Utf8Policy::Recheck | Utf8Policy::Valid => {
                        self.utf8.store(policy as i8, Relaxed);
                    }
                    Utf8Policy::Preserve => {
                        // An invalid input implies nothing about the output: demote to unknown.
                        let _ = self.utf8.compare_exchange(-1i8, 0i8, Relaxed, Relaxed);
                    }
                    Utf8Policy::PreserveStrict => {}
                }
                // `ownership` is left untouched here.
                self.value = slice;
            }
        }
        outcome.value
    }
}
impl<'a> Deref for Bytes<'a> {
    type Target = [u8];

    /// Gives access to the bytes of this string as a plain slice.
    fn deref(&self) -> &[u8] {
        &*self.value
    }
}
impl<'a> AsRef<[u8]> for Bytes<'a> {
    /// Returns the bytes of this string.
    fn as_ref(&self) -> &[u8] {
        let bytes: &[u8] = self.value;
        bytes
    }
}
impl<'a> std::borrow::Borrow<[u8]> for Bytes<'a> {
    /// Returns the bytes of this string.
    ///
    /// `Eq`, `Ord` and `Hash` for `Bytes` are all defined on these same bytes.
    fn borrow(&self) -> &[u8] {
        &self.value[..]
    }
}
// Conversions to Bytes.
impl From<Vec<u8>> for Bytes<'static> {
fn from(value: Vec<u8>) -> Self {
unsafe {
let (owned, value) = OwnedBytes::from_vec(value);
Bytes { value, ownership: Some(owned), utf8: 0i8.into() }
}
}
}
impl From<String> for Bytes<'static> {
fn from(value: String) -> Self {
unsafe {
let (owned, value) = OwnedBytes::from_vec(value.into_bytes());
Bytes { value, ownership: Some(owned), utf8: 1i8.into() }
}
}
}
impl<'a> From<&'a [u8]> for Bytes<'a> {
fn from(value: &'a [u8]) -> Self {
value.to_vec().into()
}
}
impl<'a> From<&'a str> for Bytes<'a> {
fn from(value: &'a str) -> Self {
value.to_owned().into()
}
}
impl<'a> From<Cow<'a, [u8]>> for Bytes<'a> {
fn from(value: Cow<'a, [u8]>) -> Self {
match value {
Cow::Borrowed(s) => s.into(),
Cow::Owned(s) => s.into(),
}
}
}
impl<'a> From<Cow<'a, str>> for Bytes<'a> {
fn from(value: Cow<'a, str>) -> Self {
match value {
Cow::Borrowed(s) => s.into(),
Cow::Owned(s) => s.into(),
}
}
}
// Other impls.
impl Clone for Bytes<'_> {
fn clone(&self) -> Self {
Bytes {
value: self.value,
ownership: self.ownership.clone(),
utf8: self.utf8.load(Relaxed).into(),
}
}
}
impl PartialEq<[u8]> for Bytes<'_> {
fn eq(&self, other: &[u8]) -> bool {
self.value == other
}
}
impl PartialEq for Bytes<'_> {
    /// Two byte strings are equal when their bytes are equal.
    ///
    /// Ownership and the cached UTF-8 validity do not take part in the comparison.
    fn eq(&self, b: &Bytes<'_>) -> bool {
        let (lhs, rhs): (&[u8], &[u8]) = (self.value, b.value);
        lhs == rhs
    }
}
// Equality of `Bytes` is plain equality of the underlying bytes.
impl Eq for Bytes<'_> {}
impl PartialOrd for Bytes<'_> {
    /// Always returns `Some`, deferring to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
impl Ord for Bytes<'_> {
    /// Orders byte strings by comparing their bytes as slices.
    ///
    /// Ownership and the cached UTF-8 validity do not take part in the comparison.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        <[u8] as Ord>::cmp(self.value, other.value)
    }
}
impl std::hash::Hash for Bytes<'_> {
    /// Hashes the bytes of this string exactly as the `[u8]` slice would be hashed.
    ///
    /// Ownership and the cached UTF-8 validity do not take part in the hash.
    fn hash<H: std::hash::Hasher>(&self, hasher: &mut H) {
        // Fully-qualified call: `std::hash::Hash` is only named by path in this file,
        // so method-call syntax (`self.value.hash(..)`) would not find the trait method.
        <[u8] as std::hash::Hash>::hash(self.value, hasher)
    }
}
impl std::fmt::Display for Bytes<'_> {
    /// Writes the bytes as text, with every non-UTF-8 byte sequence replaced
    /// by the U+FFFD replacement character.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text = String::from_utf8_lossy(self.value);
        f.write_str(&text)
    }
}
impl std::fmt::Debug for Bytes<'_> {
    /// Formats this string as a struct with two fields:
    /// `owning`, telling whether the string owns its data,
    /// and `value`, shown as a `str` if the bytes are valid UTF-8
    /// and as a byte slice otherwise.
    ///
    /// This may run (and cache) a UTF-8 validity check.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The struct name matches the type's actual name.
        let mut builder = f.debug_struct("Bytes");
        builder.field("owning", &self.ownership.is_some());
        match self.to_utf8() {
            Some(text) => builder.field("value", &text),
            None => builder.field("value", &self.value),
        };
        builder.finish()
    }
}
/// Implementation detail of Bytes.
/// A reference-counted dynamically-sized byte array.
///
/// It is designed to allow dirt-cheap conversions from Vec.
/// It returns a reference to the content on construction
/// but otherwise provides no access to the content.
/// There is also only a strong reference count, which is kept separate.
struct OwnedBytes {
    /// Heap-allocated strong reference count shared by all clones.
    rc: NonNull<AtomicUsize>,
    /// Start of the buffer taken over from the source `Vec`.
    data: NonNull<u8>,
    /// Capacity of that buffer, needed to hand it back to `Vec` for deallocation.
    size: usize,
}

impl OwnedBytes {
    /// Converts a Vec into Self (which owns the data)
    /// and a slice with an unbound lifetime (which does not).
    ///
    /// # Safety
    /// Unbound lifetimes are the devil.
    /// The returned slice must not be used after the last clone of the returned
    /// `OwnedBytes` has been dropped, whatever lifetime `'a` gets inferred to.
    pub unsafe fn from_vec<'a>(mut value: Vec<u8>) -> (Self, &'a [u8]) {
        // We're at the mercy of what the global allocator does here,
        // but at least this potentially does NOT copy.
        value.shrink_to_fit();
        // Give up the Vec's ownership of the buffer *before* taking raw pointers into it,
        // so that the Vec is never moved or dropped while those pointers are live.
        let mut value = std::mem::ManuallyDrop::new(value);
        let len = value.len();
        let size = value.capacity();
        // SAFETY: as_mut_ptr never returns null; if the vector did not allocate,
        // it returns a dangling pointer that is valid for 0-size reads.
        // https://doc.rust-lang.org/std/vec/struct.Vec.html#method.as_mut_ptr
        let data = unsafe { NonNull::new_unchecked(value.as_mut_ptr()) };
        // SAFETY: into_raw returns a non-null pointer.
        // https://doc.rust-lang.org/std/boxed/struct.Box.html#method.into_raw
        let rc = unsafe { NonNull::new_unchecked(Box::into_raw(Box::new(AtomicUsize::new(1)))) };
        let owner = OwnedBytes { rc, data, size };
        // SAFETY: `data` points to `len` initialized bytes (or is dangling with `len == 0`),
        // and `owner` keeps them allocated. The caller is responsible for the lifetime.
        let slice = unsafe { std::slice::from_raw_parts(data.as_ptr(), len) };
        (owner, slice)
    }
}

impl Clone for OwnedBytes {
    /// Creates another handle to the same buffer by incrementing the reference count.
    fn clone(&self) -> Self {
        // SAFETY: the counter stays allocated for as long as any handle, such as `self`, exists.
        let rc = unsafe { self.rc.as_ref() };
        // Relaxed suffices: a new handle can only be made from an existing one,
        // so the count cannot concurrently drop to zero.
        rc.fetch_add(1, Ordering::Relaxed);
        OwnedBytes { rc: self.rc, data: self.data, size: self.size }
    }
}

impl Drop for OwnedBytes {
    /// Decrements the reference count and frees the buffer and the counter
    /// if this was the last handle.
    fn drop(&mut self) {
        // SAFETY: the counter stays allocated for as long as any handle, such as `self`, exists.
        let previous = unsafe { self.rc.as_ref() }.fetch_sub(1, Ordering::Release);
        if previous != 1 {
            return;
        }
        // Pairs with the Release decrement of every other handle, so that all uses of
        // the buffer through those handles happen-before the deallocation below.
        std::sync::atomic::fence(Ordering::Acquire);
        // SAFETY: this was the last handle, so nothing else refers to either allocation.
        // `data` and `size` are the pointer and capacity of the Vec consumed by `from_vec`;
        // a length of 0 is fine because `u8` needs no per-element drop.
        // `rc` came from `Box::into_raw` and is turned back into a Box exactly once.
        unsafe {
            std::mem::drop(Vec::from_raw_parts(self.data.as_ptr(), 0, self.size));
            std::mem::drop(Box::from_raw(self.rc.as_ptr()));
        }
    }
}
// Unfortunately, address sanitizer support in rustc is still unstable.
// https://github.com/rust-lang/rust/issues/39699