Skip to content

Commit

Permalink
temp
Browse files Browse the repository at this point in the history
  • Loading branch information
dralley committed Jul 25, 2022
1 parent 6d883b5 commit b16c52e
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 50 deletions.
165 changes: 164 additions & 1 deletion src/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
//! A module for wrappers that encode / decode data.

use std::borrow::Cow;
use std::io::{self, BufRead, Read};

#[cfg(feature = "encoding")]
use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
use encoding_rs::{Decoder as ExtDecoder, Encoding, UTF_16BE, UTF_16LE, UTF_8, CoderResult};

use crate::{Error, Result};

Expand Down Expand Up @@ -184,4 +185,166 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
}
}

/// A reference to an encoding together with information about how it was retrieved.
///
/// `Explicit` and `XmlDetected` are final states (`can_be_refined` returns `false`
/// for them); `Implicit` and `BomDetected` may still be replaced.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
/// Implicit -- from_str --> Explicit
/// Implicit -- BOM --> BomDetected
/// Implicit -- "encoding=..." --> XmlDetected
/// BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
pub(crate) enum EncodingRef {
/// Encoding was implicitly assumed to have a specified value. It can be refined
/// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`).
Implicit(&'static Encoding),
/// Encoding was explicitly set to the desired value. It can be changed
/// neither by a BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`).
Explicit(&'static Encoding),
/// Encoding was detected from a byte order mark (BOM) or from the first bytes
/// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`).
BomDetected(&'static Encoding),
/// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
/// It can no longer change.
XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, no matter how it was determined.
    #[inline]
    pub(crate) fn encoding(&self) -> &'static Encoding {
        // Every variant carries the same `Copy` payload, so a single
        // or-pattern can copy the reference out of any of them.
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }

    /// Returns `true` while the encoding may still be replaced by a better guess,
    /// i.e. for the `Implicit` and `BomDetected` states.
    #[inline]
    pub(crate) fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}

/// A buffered reader adapter that decodes the bytes of an inner reader to UTF-8.
///
/// Raw bytes are pulled from `inner`, run through an `encoding_rs` decoder and
/// stored in `decoded_buffer`. The [`BufRead`] and [`Read`] implementations
/// serve only those decoded bytes, never the raw input.
#[cfg(feature = "encoding")]
struct DecodingBufReader<R> {
    /// The underlying source of raw (still encoded) bytes.
    inner: R,
    /// Decoded UTF-8 bytes. The vector's length marks the end of the valid data.
    decoded_buffer: Vec<u8>,
    /// Read cursor into `decoded_buffer`: bytes before it were already consumed.
    /// Invariant: `current_pos <= decoded_buffer.len()`.
    current_pos: usize,

    /// Streaming decoder that converts the raw input to UTF-8.
    decoder: ExtDecoder,
    /// The encoding in use together with how it was determined.
    encoding: EncodingRef,
}

#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingBufReader<R> {
    /// Returns the decoded bytes that have not been consumed yet, decoding more
    /// input from the inner reader when none are left.
    ///
    /// An empty slice is returned only when the inner reader reported end of input.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors of the inner reader and fails if the input contains
    /// byte sequences that are malformed in the current encoding.
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        if self.current_pos >= self.decoded_buffer.len() {
            // Everything decoded so far was handed out: start over with an empty buffer.
            self.decoded_buffer.clear();
            self.current_pos = 0;

            // A chunk may end in the middle of a multi-byte sequence and thus decode
            // to nothing. Keep reading in that case, because returning an empty slice
            // would wrongly signal end of input to the caller.
            while self.decoded_buffer.is_empty() {
                let data = self.inner.fill_buf()?;
                let at_eof = data.is_empty();
                // `data` borrows `self.inner`, therefore the decoder and the output
                // buffer are passed as separate field borrows instead of `&mut self`.
                let consumed =
                    Self::decode_into(&mut self.decoder, &mut self.decoded_buffer, data, at_eof)?;
                self.inner.consume(consumed);
                if at_eof {
                    break;
                }
            }
        }

        Ok(&self.decoded_buffer[self.current_pos..])
    }

    /// Marks `amt` decoded bytes as consumed.
    ///
    /// The cursor is clamped to the amount of decoded data actually available.
    fn consume(&mut self, amt: usize) {
        self.current_pos = self
            .current_pos
            .saturating_add(amt)
            .min(self.decoded_buffer.len());
    }
}

// The bound is `BufRead` (not just `Read`) because reading has to go through the
// decoding `fill_buf`; the type can only be constructed with `R: BufRead` anyway.
#[cfg(feature = "encoding")]
impl<R: BufRead> Read for DecodingBufReader<R> {
    /// Copies decoded bytes into `buf` and returns how many were copied.
    ///
    /// Returns `Ok(0)` when `buf` is empty or the end of input was reached.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }

        let available = self.fill_buf()?;
        let amount = available.len().min(buf.len());
        buf[..amount].copy_from_slice(&available[..amount]);
        self.consume(amount);
        Ok(amount)
    }
}

#[cfg(feature = "encoding")]
impl<R: BufRead> DecodingBufReader<R> {
    /// Wraps `inner`, implicitly assuming UTF-8 input until told otherwise.
    fn new(inner: R) -> Self {
        DecodingBufReader {
            inner,
            decoded_buffer: Vec::new(),
            current_pos: 0,

            decoder: UTF_8.new_decoder(),
            encoding: EncodingRef::Implicit(UTF_8),
        }
    }

    /// Returns the raw, not yet decoded bytes buffered by the inner reader
    /// without consuming them.
    fn get_raw_buffer(&mut self) -> io::Result<&[u8]> {
        self.inner.fill_buf()
    }

    /// Move unconsumed data to the front of the buffer and reset the read cursor
    fn shuffle(&mut self) {
        if self.current_pos == 0 {
            return;
        }

        // Clamp defensively so that a stale cursor can never cause a panic here.
        let consumed = self.current_pos.min(self.decoded_buffer.len());
        self.decoded_buffer.drain(..consumed);
        self.current_pos = 0;
    }

    /// Drops consumed data and shrinks the buffer's capacity towards `size`.
    ///
    /// The capacity never drops below the amount of unconsumed data.
    fn shrink_buffer(&mut self, size: usize) {
        self.shuffle();
        self.decoded_buffer.shrink_to(size);
    }

    /// Explicitly sets the encoding of the input that is still to be decoded.
    ///
    /// The decoder is replaced as well; updating only the `encoding` field would
    /// leave the previous decoder in charge. The new decoder strips a BOM of the
    /// same encoding but, being explicit, never switches encoding because of one.
    fn set_encoding(&mut self, encoding: &'static Encoding) {
        self.encoding = EncodingRef::Explicit(encoding);
        self.decoder = encoding.new_decoder_with_bom_removal();
    }

    /// Decodes `data` and appends the result to the decoded buffer.
    ///
    /// An empty `data` signals the end of the input and flushes the decoder.
    /// Returns the number of input bytes that were read from `data`.
    ///
    /// # Errors
    ///
    /// Fails if `data` contains malformed byte sequences.
    fn feed(&mut self, data: &[u8]) -> io::Result<usize> {
        Self::decode_into(
            &mut self.decoder,
            &mut self.decoded_buffer,
            data,
            data.is_empty(),
        )
    }

    /// Decodes `data` with `decoder`, appending the UTF-8 output to `output`.
    ///
    /// `last` must be `true` when `data` is the end of the input. Returns the number
    /// of bytes read from `data`. On error nothing is appended to `output`.
    fn decode_into(
        decoder: &mut ExtDecoder,
        output: &mut Vec<u8>,
        data: &[u8],
        last: bool,
    ) -> io::Result<usize> {
        let start = output.len();
        // `Vec::reserve` alone does not make the spare capacity addressable as a
        // slice, so grow the vector to the worst-case output size and cut it back
        // to what was really written afterwards.
        let worst_case = decoder
            .max_utf8_buffer_length(data.len())
            .and_then(|needed| start.checked_add(needed))
            .ok_or_else(|| {
                io::Error::new(io::ErrorKind::Other, "Decoded data does not fit into memory")
            })?;
        output.resize(worst_case, 0);

        let (result, read, written, had_errors) =
            decoder.decode_to_utf8(data, &mut output[start..], last);

        if last {
            // An `encoding_rs` decoder must not be used again once the end of the
            // stream was signalled. Start a fresh one so that further calls at
            // (or after) the end of input stay safe.
            *decoder = decoder.encoding().new_decoder_without_bom_handling();
        }

        match result {
            CoderResult::InputEmpty if had_errors => {
                // Do not expose output that contains replacement characters.
                output.truncate(start);
                Err(io::Error::new(io::ErrorKind::Other, "Errors decoding"))
            }
            CoderResult::InputEmpty => {
                output.truncate(start + written);
                Ok(read)
            }
            CoderResult::OutputFull => {
                unreachable!("This shouldn't happen, we reserved space")
            }
        }
    }
}

#[cfg(test)]
mod tests {
// Placeholder: this module does not contain any unit tests yet.
}



// TODO: add some tests for functions
51 changes: 2 additions & 49 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
use std::str::from_utf8;

#[cfg(feature = "encoding")]
use encoding_rs::{Encoding, UTF_8};
use encoding_rs::UTF_8;

#[cfg(feature = "encoding")]
use crate::encoding::detect_encoding;
use crate::encoding::{detect_encoding, EncodingRef};
use crate::encoding::Decoder;
use crate::errors::{Error, Result};
use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
Expand Down Expand Up @@ -179,53 +179,6 @@ enum TagState {
Exit,
}

/// A reference to an encoding together with information about how it was retrieved.
///
/// `Explicit` and `XmlDetected` are final states (`can_be_refined` returns `false`
/// for them); `Implicit` and `BomDetected` may still be replaced.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
/// Implicit -- from_str --> Explicit
/// Implicit -- BOM --> BomDetected
/// Implicit -- "encoding=..." --> XmlDetected
/// BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy)]
enum EncodingRef {
/// Encoding was implicitly assumed to have a specified value. It can be refined
/// using a BOM or by the XML declaration event (`<?xml encoding=... ?>`).
Implicit(&'static Encoding),
/// Encoding was explicitly set to the desired value. It can be changed
/// neither by a BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`).
Explicit(&'static Encoding),
/// Encoding was detected from a byte order mark (BOM) or from the first bytes
/// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`).
BomDetected(&'static Encoding),
/// Encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
/// It can no longer change.
XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, no matter how it was determined.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        // Every variant carries the same `Copy` payload, so a single
        // or-pattern can copy the reference out of any of them.
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }

    /// Returns `true` while the encoding may still be replaced by a better guess,
    /// i.e. for the `Implicit` and `BomDetected` states.
    #[inline]
    fn can_be_refined(&self) -> bool {
        match *self {
            Self::Explicit(_) | Self::XmlDetected(_) => false,
            Self::Implicit(_) | Self::BomDetected(_) => true,
        }
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

/// A low level encoding-agnostic XML event reader.
Expand Down

0 comments on commit b16c52e

Please sign in to comment.