summaryrefslogtreecommitdiff
path: root/src/encoding.rs
blob: 598f993e2253bc97ec3b009be2a6c75846b48da2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
use std::convert::TryFrom;
use std::io::{self, Write};
use crate::error::IntoIOError;

/// Bails out of the enclosing function with `None` unless `$cond` holds.
///
/// The expansion contains a bare `return None`, so the macro is only usable
/// inside a function or closure whose return type is an `Option`.
macro_rules! guard {
    ($cond:expr) => {
        // The captured expression is negated as a whole; when the condition
        // is false the surrounding function returns early.
        if !$cond {
            return None;
        }
    }
}

/// Decodes `bytes` as Latin-1, mapping every byte to the Unicode scalar value
/// with the same number.
///
/// Returns `None` when `bytes` is empty; any non-empty input decodes
/// successfully.
pub fn from_latin_1(bytes: &[u8]) -> Option<String> {
    guard!(!bytes.is_empty());

    // One char per input byte; bytes >= 0x80 need two UTF-8 bytes each, so
    // this capacity is only a lower bound.
    let mut text = String::with_capacity(bytes.len());
    for &byte in bytes {
        text.push(char::from(byte));
    }

    Some(text)
}

/// Decodes UCS-2 text that begins with a byte-order mark.
///
/// The first two bytes are read as a big-endian 16-bit value: `0xfeff` selects
/// big-endian code units, `0xfffe` selects little-endian ones. The BOM itself
/// is not part of the returned string.
///
/// Returns `None` when the input is shorter than two bytes, has an odd length,
/// starts with anything other than one of the two BOM values, or contains a
/// code unit that is not a valid `char` (for example a surrogate).
pub fn from_ucs_2_bom(bytes: &[u8]) -> Option<String> {
    let len = bytes.len();

    guard!(len >= 2);
    guard!(len % 2 == 0);

    // Resolve the byte order once instead of re-testing the BOM per unit.
    let big_endian = match read_big_endian(&bytes[..2], 8) {
        0xfeff => true,
        0xfffe => false,
        _ => return None,
    };

    let mut decoded = String::with_capacity(len / 2 - 1);

    // Everything after the two BOM bytes is a sequence of 16-bit code units.
    for pair in bytes[2..].chunks_exact(2) {
        let (high, low) = if big_endian {
            (pair[0], pair[1])
        } else {
            (pair[1], pair[0])
        };
        let unit = (u16::from(high) << 8) | u16::from(low);

        decoded.push(char::try_from(u32::from(unit)).ok()?);
    }

    Some(decoded)
}

/// Decodes UTF-16 text whose byte order is described by a separately supplied
/// byte-order mark.
///
/// `bytes` is the payload only: it must not contain the BOM itself, and every
/// byte pair in it is decoded. `bom` is the mark as read in big-endian order:
/// `0xfeff` means the payload is big-endian, `0xfffe` means it is
/// little-endian.
///
/// Returns `None` when `bytes` has an odd length, when `bom` is neither of the
/// two recognised values, or when the code units are not valid UTF-16 (for
/// example an unpaired surrogate). An empty payload decodes to an empty string.
pub fn from_utf16_genericbom(bytes: &[u8], bom: u16) -> Option<String> {
    if bytes.len() % 2 != 0 {
        return None;
    }

    let big_endian = match bom {
        0xfeff => true,
        0xfffe => false,
        // Invalid BOM
        _ => return None,
    };

    // Decode *every* pair: the BOM is passed separately, so there is no
    // leading unit to skip here.
    let units: Vec<u16> = bytes
        .chunks_exact(2)
        .map(|pair| {
            let raw = [pair[0], pair[1]];
            if big_endian {
                u16::from_be_bytes(raw)
            } else {
                u16::from_le_bytes(raw)
            }
        })
        .collect();

    String::from_utf16(&units).ok()
}

/// Decodes UTF-16 text that begins with a byte-order mark.
///
/// The first two bytes are read as a big-endian 16-bit value and handed to
/// `from_utf16_genericbom` as the BOM, together with the remaining bytes.
///
/// Returns `None` when fewer than two bytes are supplied, or whenever
/// `from_utf16_genericbom` rejects the BOM or the remaining data.
pub fn from_utf16_bom(bytes: &[u8]) -> Option<String> {
    guard!(bytes.len() >= 2);

    let (mark, payload) = bytes.split_at(2);
    // Two 8-bit groups never exceed 0xffff, so narrowing to u16 is lossless.
    let bom = read_big_endian(mark, 8) as u16;

    from_utf16_genericbom(payload, bom)
}

/// Decodes UTF-16 text that carries no byte-order mark.
///
/// Delegates to `from_utf16_genericbom` with a BOM value of `0xfeff`, i.e. the
/// data is treated as big-endian.
pub fn from_utf16_nobom(bytes: &[u8]) -> Option<String> {
    // With no mark present, assume the value that selects big-endian decoding.
    let assumed_bom: u16 = 0xfeff;
    from_utf16_genericbom(bytes, assumed_bom)
}

pub fn from_utf8_mistaken_as_latin1(latin1: &str) -> io::Result<String> {
    latin1
        .chars()
        .map(|c| u8::try_from(u32::from(c)))
        .collect::<Result<Vec<u8>, _>>()
        .map_err(|e| e.ioerr())
        .and_then(|v| std::str::from_utf8(&v)
                        .map(|s| s.to_string())
                        .map_err(|e| e.ioerr()))
}

/// Combines `bytes` into one integer, most significant group first.
///
/// Each byte contributes `bits` bits of positional weight: the last byte is
/// added unshifted, the one before it is shifted left by `bits`, and so on.
/// The shifted values are summed, so with `bits < 8` any high bits set in a
/// byte carry into the next group rather than being masked off.
///
/// An empty slice yields `0`.
///
/// # Panics
///
/// Panics if `bits` is greater than 8.
pub fn read_big_endian(bytes: &[u8], bits: usize) -> usize {
    if bits > 8 {
        panic!("Invalid number of bits in encoding::read_big_endian()");
    }

    let count = bytes.len();
    let mut total: usize = 0;

    for (index, &byte) in bytes.iter().enumerate() {
        // Earlier bytes are more significant, hence the larger shift.
        let shift = bits * (count - 1 - index);
        total += (byte as usize) << shift;
    }

    total
}

/// Writes `value` to `stream` as `num_bytes` bytes, most significant group
/// first, storing `bits` bits of the value in each output byte.
///
/// With `bits == 8` this is a plain big-endian integer; smaller values of
/// `bits` leave the upper bits of every output byte clear.
///
/// # Errors
///
/// Returns whatever error `stream.write_all` reports.
///
/// # Panics
///
/// Panics if `num_bytes` exceeds 8, if `bits` is 0 or exceeds 8, or if
/// `value` needs more than `num_bytes * bits` bits.
pub fn write_big_endian<W: Write>(mut stream: W, mut value: usize, num_bytes: usize, bits: usize) -> io::Result<()> {
    assert!(
        num_bytes <= 8,
        "Invalid number of bytes in encoding::write_big_endian()"
    );
    assert!(
        bits != 0 && bits <= 8,
        "Invalid number of bits in encoding::write_big_endian()"
    );

    let mask: usize = (1 << bits) - 1;
    let mut buffer = [0u8; 8];

    // Fill from the last byte backwards so the least significant group ends
    // up at the end of the output.
    for slot in buffer[..num_bytes].iter_mut().rev() {
        *slot = (value & mask) as u8;
        value >>= bits;
    }

    // Anything left over means the value did not fit in the requested width.
    assert!(
        value == 0,
        "Value doesn't fit in encoding::write_big_endian()"
    );

    stream.write_all(&buffer[..num_bytes])
}