// (C) Copyright 2016 Jethro G. Beekman
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
//! Parsing C literals from byte slices.
//!
//! This will parse a representation of a C literal into a Rust type.
//!
//! # characters
//! Character literals are stored into the `CChar` type, which can hold values
//! that are not valid Unicode code points. ASCII characters are represented as
//! `char`, literal bytes with the high bit set are converted into the raw
//! representation. Escape sequences are supported. If hex and octal escapes
//! map to an ASCII character, that is used, otherwise, the raw encoding is
//! used, including for values over 255. Unicode escapes are checked for
//! validity and mapped to `char`. Character sequences are not supported. Width
//! prefixes are ignored.
//!
//! # strings
//! Strings are interpreted as byte vectors. Escape sequences are supported. If
//! hex and octal escapes map onto multi-byte characters, they are truncated to
//! one 8-bit character. Unicode escapes are converted into their UTF-8
//! encoding. Width prefixes are ignored.
//!
//! # integers
//! Integers are read into `i64`. Binary, octal, decimal and hexadecimal are
//! all supported. If the literal value is between `i64::MAX` and `u64::MAX`,
//! it is bit-cast to `i64`. Values over `u64::MAX` cannot be parsed. Width and
//! sign suffixes are ignored. Sign prefixes are not supported.
//!
//! # real numbers
//! Reals are read into `f64`. Width suffixes are ignored. Sign prefixes are
//! not supported in the significand. Hexadecimal floating points are not
//! supported.
38 39use std::char; 40use std::str::{self, FromStr}; 41 42use nom::branch::alt; 43use nom::bytes::complete::is_not; 44use nom::bytes::complete::tag; 45use nom::character::complete::{char, one_of}; 46use nom::combinator::{complete, map, map_opt, opt, recognize}; 47use nom::multi::{fold_many0, many0, many1, many_m_n}; 48use nom::sequence::{delimited, pair, preceded, terminated, tuple}; 49use nom::*; 50 51use crate::expr::EvalResult; 52use crate::ToCexprResult; 53 54#[derive(Debug, Copy, Clone, PartialEq, Eq)] 55/// Representation of a C character 56pub enum CChar { 57 /// A character that can be represented as a `char` 58 Char(char), 59 /// Any other character (8-bit characters, unicode surrogates, etc.) 60 Raw(u64), 61} 62 63impl From<u8> for CChar { 64 fn from(i: u8) -> CChar { 65 match i { 66 0..=0x7f => CChar::Char(i as u8 as char), 67 _ => CChar::Raw(i as u64), 68 } 69 } 70} 71 72// A non-allocating version of this would be nice... 73impl std::convert::Into<Vec<u8>> for CChar { 74 fn into(self) -> Vec<u8> { 75 match self { 76 CChar::Char(c) => { 77 let mut s = String::with_capacity(4); 78 s.extend(&[c]); 79 s.into_bytes() 80 } 81 CChar::Raw(i) => { 82 let mut v = Vec::with_capacity(1); 83 v.push(i as u8); 84 v 85 } 86 } 87 } 88} 89 90/// ensures the child parser consumes the whole input 91pub fn full<I: Clone, O, F>( 92 f: F, 93) -> impl Fn(I) -> nom::IResult<I, O> 94where 95 I: nom::InputLength, 96 F: Fn(I) -> nom::IResult<I, O>, 97{ 98 move |input| { 99 let res = f(input); 100 match res { 101 Ok((i, o)) => { 102 if i.input_len() == 0 { 103 Ok((i, o)) 104 } else { 105 Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::Complete))) 106 } 107 } 108 r => r, 109 } 110 } 111} 112 113// ================================= 114// ======== matching digits ======== 115// ================================= 116 117macro_rules! 
byte { 118 ($($p: pat)|* ) => {{ 119 fn parser(i: &[u8]) -> crate::nom::IResult<&[u8], u8> { 120 match i.split_first() { 121 $(Some((&c @ $p,rest)))|* => Ok((rest,c)), 122 Some(_) => Err(nom::Err::Error(nom::error::Error::new(i, nom::error::ErrorKind::OneOf))), 123 None => Err(nom::Err::Incomplete(Needed::new(1))), 124 } 125 } 126 127 parser 128 }} 129} 130 131fn binary(i: &[u8]) -> nom::IResult<&[u8], u8> { 132 byte!(b'0'..=b'1')(i) 133} 134 135fn octal(i: &[u8]) -> nom::IResult<&[u8], u8> { 136 byte!(b'0'..=b'7')(i) 137} 138 139fn decimal(i: &[u8]) -> nom::IResult<&[u8], u8> { 140 byte!(b'0'..=b'9')(i) 141} 142 143fn hexadecimal(i: &[u8]) -> nom::IResult<&[u8], u8> { 144 byte!(b'0' ..= b'9' | b'a' ..= b'f' | b'A' ..= b'F')(i) 145} 146 147// ======================================== 148// ======== characters and strings ======== 149// ======================================== 150 151fn escape2char(c: char) -> CChar { 152 CChar::Char(match c { 153 'a' => '\x07', 154 'b' => '\x08', 155 'f' => '\x0c', 156 'n' => '\n', 157 'r' => '\r', 158 't' => '\t', 159 'v' => '\x0b', 160 _ => unreachable!("invalid escape {}", c), 161 }) 162} 163 164fn c_raw_escape(n: Vec<u8>, radix: u32) -> Option<CChar> { 165 str::from_utf8(&n) 166 .ok() 167 .and_then(|i| u64::from_str_radix(i, radix).ok()) 168 .map(|i| match i { 169 0..=0x7f => CChar::Char(i as u8 as char), 170 _ => CChar::Raw(i), 171 }) 172} 173 174fn c_unicode_escape(n: Vec<u8>) -> Option<CChar> { 175 str::from_utf8(&n) 176 .ok() 177 .and_then(|i| u32::from_str_radix(i, 16).ok()) 178 .and_then(char::from_u32) 179 .map(CChar::Char) 180} 181 182fn escaped_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { 183 preceded( 184 char('\\'), 185 alt(( 186 map(one_of(r#"'"?\"#), CChar::Char), 187 map(one_of("abfnrtv"), escape2char), 188 map_opt(many_m_n(1, 3, octal), |v| c_raw_escape(v, 8)), 189 map_opt(preceded(char('x'), many1(hexadecimal)), |v| { 190 c_raw_escape(v, 16) 191 }), 192 map_opt( 193 preceded(char('u'), many_m_n(4, 4, 
hexadecimal)), 194 c_unicode_escape, 195 ), 196 map_opt( 197 preceded(char('U'), many_m_n(8, 8, hexadecimal)), 198 c_unicode_escape, 199 ), 200 )), 201 )(i) 202} 203 204fn c_width_prefix(i: &[u8]) -> nom::IResult<&[u8], &[u8]> { 205 alt((tag("u8"), tag("u"), tag("U"), tag("L")))(i) 206} 207 208fn c_char(i: &[u8]) -> nom::IResult<&[u8], CChar> { 209 delimited( 210 terminated(opt(c_width_prefix), char('\'')), 211 alt(( 212 escaped_char, 213 map(byte!(0 ..= 91 /* \=92 */ | 93 ..= 255), CChar::from), 214 )), 215 char('\''), 216 )(i) 217} 218 219fn c_string(i: &[u8]) -> nom::IResult<&[u8], Vec<u8>> { 220 delimited( 221 alt((preceded(c_width_prefix, char('"')), char('"'))), 222 fold_many0( 223 alt(( 224 map(escaped_char, |c: CChar| c.into()), 225 map(is_not([b'\\', b'"']), |c: &[u8]| c.into()), 226 )), 227 Vec::new, 228 |mut v: Vec<u8>, res: Vec<u8>| { 229 v.extend_from_slice(&res); 230 v 231 }, 232 ), 233 char('"'), 234 )(i) 235} 236 237// ================================ 238// ======== parse integers ======== 239// ================================ 240 241fn c_int_radix(n: Vec<u8>, radix: u32) -> Option<u64> { 242 str::from_utf8(&n) 243 .ok() 244 .and_then(|i| u64::from_str_radix(i, radix).ok()) 245} 246 247fn take_ul(input: &[u8]) -> IResult<&[u8], &[u8]> { 248 let r = input.split_at_position(|c| c != b'u' && c != b'U' && c != b'l' && c != b'L'); 249 match r { 250 Err(Err::Incomplete(_)) => Ok((&input[input.len()..], input)), 251 res => res, 252 } 253} 254 255fn c_int(i: &[u8]) -> nom::IResult<&[u8], i64> { 256 map( 257 terminated( 258 alt(( 259 map_opt(preceded(tag("0x"), many1(complete(hexadecimal))), |v| { 260 c_int_radix(v, 16) 261 }), 262 map_opt(preceded(tag("0X"), many1(complete(hexadecimal))), |v| { 263 c_int_radix(v, 16) 264 }), 265 map_opt(preceded(tag("0b"), many1(complete(binary))), |v| { 266 c_int_radix(v, 2) 267 }), 268 map_opt(preceded(tag("0B"), many1(complete(binary))), |v| { 269 c_int_radix(v, 2) 270 }), 271 map_opt(preceded(char('0'), 
many1(complete(octal))), |v| { 272 c_int_radix(v, 8) 273 }), 274 map_opt(many1(complete(decimal)), |v| c_int_radix(v, 10)), 275 |input| Err(crate::nom::Err::Error(nom::error::Error::new(input, crate::nom::ErrorKind::Fix))), 276 )), 277 opt(take_ul), 278 ), 279 |i| i as i64, 280 )(i) 281} 282 283// ============================== 284// ======== parse floats ======== 285// ============================== 286 287fn float_width(i: &[u8]) -> nom::IResult<&[u8], u8> { 288 nom::combinator::complete(byte!(b'f' | b'l' | b'F' | b'L'))(i) 289} 290 291fn float_exp(i: &[u8]) -> nom::IResult<&[u8], (Option<u8>, Vec<u8>)> { 292 preceded( 293 byte!(b'e' | b'E'), 294 pair(opt(byte!(b'-' | b'+')), many1(complete(decimal))), 295 )(i) 296} 297 298fn c_float(i: &[u8]) -> nom::IResult<&[u8], f64> { 299 map_opt( 300 alt(( 301 terminated( 302 recognize(tuple(( 303 many1(complete(decimal)), 304 byte!(b'.'), 305 many0(complete(decimal)), 306 ))), 307 opt(float_width), 308 ), 309 terminated( 310 recognize(tuple(( 311 many0(complete(decimal)), 312 byte!(b'.'), 313 many1(complete(decimal)), 314 ))), 315 opt(float_width), 316 ), 317 terminated( 318 recognize(tuple(( 319 many0(complete(decimal)), 320 opt(byte!(b'.')), 321 many1(complete(decimal)), 322 float_exp, 323 ))), 324 opt(float_width), 325 ), 326 terminated( 327 recognize(tuple(( 328 many1(complete(decimal)), 329 opt(byte!(b'.')), 330 many0(complete(decimal)), 331 float_exp, 332 ))), 333 opt(float_width), 334 ), 335 terminated(recognize(many1(complete(decimal))), float_width), 336 )), 337 |v| str::from_utf8(v).ok().and_then(|i| f64::from_str(i).ok()), 338 )(i) 339} 340 341// ================================ 342// ======== main interface ======== 343// ================================ 344 345fn one_literal(input: &[u8]) -> nom::IResult<&[u8], EvalResult, crate::Error<&[u8]>> { 346 alt(( 347 map(full(c_char), EvalResult::Char), 348 map(full(c_int), |i| EvalResult::Int(::std::num::Wrapping(i))), 349 map(full(c_float), EvalResult::Float), 350 
map(full(c_string), EvalResult::Str), 351 ))(input) 352 .to_cexpr_result() 353} 354 355/// Parse a C literal. 356/// 357/// The input must contain exactly the representation of a single literal 358/// token, and in particular no whitespace or sign prefixes. 359pub fn parse(input: &[u8]) -> IResult<&[u8], EvalResult, crate::Error<&[u8]>> { 360 crate::assert_full_parse(one_literal(input)) 361} 362