Try to make the nom errors better! (#22)

xd009642 · web-flow · commit f61d0ab34520 · 2022-09-25T21:38:41.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,13 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 ### Added
 - `InstrumenationProfile::is_empty` to detect when there are no records
+- Fuzzing module for profile files
 
 ### Changed
 - Added anyhow and use in place of `Result<T, Box<dyn Error>>`
+- Make error type for profiles `VerboseError`
 
 ### Fixed
 - Handle merging of completely disjoint records - now profiles generated from multiple
 applications are accurately merged
+- Handle invalid Hash enum variant in `IndexedProfile`
 
 ## [0.2.0] - 2022-09-11
 ### Changed
diff --git a/README.md b/README.md
@@ -4,12 +4,22 @@
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 [![Coverage Status](https://coveralls.io/repos/github/xd009642/llvm-profparser/badge.svg?branch=master)](https://coveralls.io/github/xd009642/llvm-profparser?branch=master)
 
-This is a WIP to parse the llvm instrumentation profraw file format and avoid
-the need to install and use the llvm-profdata binary. 
+This is a reasonably complete library to parse the llvm instrumentation profraw file
+format and avoid the need to install and use the llvm-profdata/llvm-cov
+binaries. It aims to be backwards compatible with as many llvm versions as
+could be used for coverage data in Rust projects and currently supports the
+following llvm versions: 11, 12, 13, 14, 15. 
 
 **This project is not affilated with the llvm-project in anyway!** It is merely
 a parser for some of their file formats to aid interoperability in Rust.
 
+## Contributing
+
+All of the functionality required has been implemented, however there are areas
+to improve in handling unexpected or invalid files. To start finding issues
+there's a fuzz directory which will undoubtedly reveal some issues that can be
+fixed. Go into the fuzz directory for guides on how to run. 
+
 ## License
 
 llvm\_profparser is currently licensed under the terms of the Apache License
diff --git a/fuzz/.gitignore b/fuzz/.gitignore
@@ -0,0 +1,3 @@
+target
+corpus
+artifacts
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "llvm_profparser-fuzz"
+version = "0.0.0"
+authors = ["Automatically generated"]
+publish = false
+edition = "2018"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+
+[dependencies.llvm_profparser]
+path = ".."
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "profile_data"
+path = "fuzz_targets/profile_data.rs"
+test = false
+doc = false
diff --git a/fuzz/README.md b/fuzz/README.md
@@ -0,0 +1,17 @@
+# llvm-profparser fuzz
+
+This requires a nightly compiler and cargo-fuzz. To install cargo-fuzz:
+
+```
+cargo install -f cargo-fuzz
+```
+
+And then to run for the first time:
+
+```
+./setup_corpus.sh
+cargo +nightly fuzz run profile_data
+```
+
+The script `setup_corpus.sh` copies the test files into the corpus directory in
+order to give the fuzzer a good place to start from.
diff --git a/fuzz/fuzz_targets/profile_data.rs b/fuzz/fuzz_targets/profile_data.rs
@@ -0,0 +1,8 @@
+#![no_main]
+use libfuzzer_sys::fuzz_target;
+use llvm_profparser::parse_bytes;
+
+fuzz_target!(|data: &[u8]| {
+    // Feed the raw fuzzer input to the parser; the result is deliberately discarded.
+    let _ = parse_bytes(data);
+});
diff --git a/fuzz/setup_corpus.sh b/fuzz/setup_corpus.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Here we'll just create an empty corpus folder and copy all the profile data
+# files into it.
+
+mkdir -p corpus/profile_data/
+
+cp ../tests/data/profdata/llvm-11/* corpus/profile_data/
+cp ../tests/data/profdata/llvm-12/* corpus/profile_data/
+cp ../tests/data/profdata/llvm-13/* corpus/profile_data/
+cp ../tests/data/profdata/llvm-14/* corpus/profile_data/
+cp ../tests/data/profdata/llvm-15/* corpus/profile_data/
diff --git a/src/coverage/coverage_mapping.rs b/src/coverage/coverage_mapping.rs
@@ -3,6 +3,7 @@ use crate::coverage::*;
 use crate::instrumentation_profile::types::*;
 use crate::util::*;
 use anyhow::{bail, Result};
+use nom::error::Error as NomError;
 use object::{Endian, Endianness, Object, ObjectSection, Section};
 use std::collections::HashMap;
 use std::convert::TryInto;
@@ -263,8 +264,8 @@ fn parse_coverage_mapping(
 
             //let bytes = &data[16..(16 + filename_data_len as usize)];
             let bytes = &data[16..];
-            let (bytes, file_strings) =
-                parse_path_list(bytes, version).map_err(|_| SectionReadError::InvalidPathList)?;
+            let (bytes, file_strings) = parse_path_list(bytes, version)
+                .map_err(|_: nom::Err<NomError<_>>| SectionReadError::InvalidPathList)?;
             result.insert(hash, file_strings);
             let read_len = data_len - bytes.len();
             let padding = if !bytes.is_empty() && (read_len & 0x07) != 0 {
@@ -305,21 +306,21 @@ fn parse_coverage_functions(
             let _start_len = bytes[28..].len();
             bytes = &bytes[28..];
 
-            let (data, id_len) = parse_leb128(bytes).unwrap();
+            let (data, id_len) = parse_leb128::<NomError<_>>(bytes).unwrap();
             bytes = data;
             let mut filename_indices = vec![];
             for _ in 0..id_len {
-                let (data, id) = parse_leb128(bytes).unwrap(); // Issue
+                let (data, id) = parse_leb128::<NomError<_>>(bytes).unwrap(); // Issue
                 filename_indices.push(id);
                 bytes = data;
             }
-            let (data, expr_len) = parse_leb128(bytes).unwrap();
+            let (data, expr_len) = parse_leb128::<NomError<_>>(bytes).unwrap();
             let expr_len = expr_len as usize;
             bytes = data;
             let mut exprs = vec![Expression::default(); expr_len];
             for i in 0..expr_len {
-                let (data, lhs) = parse_leb128(bytes).unwrap();
-                let (data, rhs) = parse_leb128(data).unwrap();
+                let (data, lhs) = parse_leb128::<NomError<_>>(bytes).unwrap();
+                let (data, rhs) = parse_leb128::<NomError<_>>(data).unwrap();
                 let lhs = parse_counter(lhs, &mut exprs);
                 let rhs = parse_counter(rhs, &mut exprs);
                 exprs[i].lhs = lhs;
diff --git a/src/hash_table.rs b/src/hash_table.rs
@@ -1,6 +1,9 @@
-use crate::instrumentation_profile::types::*;
+use crate::instrumentation_profile::{types::*, ParseResult};
 use indexmap::IndexMap;
-use nom::{number::complete::*, IResult};
+use nom::{
+    error::{ErrorKind, ParseError, VerboseError, VerboseErrorKind},
+    number::complete::*,
+};
 use std::borrow::Cow;
 
 #[derive(Copy, Clone, Debug)]
@@ -12,26 +15,43 @@ struct KeyDataLen {
 #[derive(Clone, Debug)]
 pub(crate) struct HashTable(pub IndexMap<(u64, String), InstrProfRecord>);
 
-fn read_key_data_len(input: &[u8]) -> IResult<&[u8], KeyDataLen> {
+fn read_key_data_len(input: &[u8]) -> ParseResult<KeyDataLen> {
     let (bytes, key_len) = le_u64(input)?;
     let (bytes, data_len) = le_u64(bytes)?;
     let res = KeyDataLen { key_len, data_len };
     Ok((bytes, res))
 }
 
-fn read_key(input: &[u8], key_len: usize) -> IResult<&[u8], Cow<'_, str>> {
-    let res = String::from_utf8_lossy(&input[..key_len]);
-    Ok((&input[key_len..], res))
+fn read_key(input: &[u8], key_len: usize) -> ParseResult<Cow<'_, str>> {
+    if key_len > input.len() {
+        Err(nom::Err::Failure(VerboseError::from_error_kind(
+            &input[input.len()..],
+            ErrorKind::Eof,
+        )))
+    } else {
+        let res = String::from_utf8_lossy(&input[..key_len]);
+        Ok((&input[key_len..], res))
+    }
 }
 
 fn read_value(
     version: u64,
     mut input: &[u8],
     data_len: usize,
-) -> IResult<&[u8], (u64, InstrProfRecord)> {
+) -> ParseResult<(u64, InstrProfRecord)> {
     if data_len % 8 != 0 {
         // Element is corrupted, it should be aligned
-        todo!();
+        let errors = vec![(
+            input,
+            VerboseErrorKind::Context("table data length is not 8 byte aligned"),
+        )];
+        return Err(nom::Err::Failure(VerboseError { errors }));
+    }
+    if input.len() < data_len {
+        return Err(nom::Err::Failure(VerboseError::from_error_kind(
+            &input[input.len()..],
+            ErrorKind::Eof,
+        )));
     }
     let mut result = vec![];
     let end_len = input.len() - data_len;
@@ -102,12 +122,12 @@ impl HashTable {
     /// buckets is the data the hash table buckets start at - the start of the `HashTable` in memory.
     /// hash. offset shows the offset from the base address to the start of the `HashTable` as this
     /// will be used to correct any offsets
-    pub(crate) fn parse(
+    pub(crate) fn parse<'a>(
         version: u64,
-        input: &[u8],
+        input: &'a [u8],
         _offset: usize,
         bucket_start: usize,
-    ) -> IResult<&[u8], Self> {
+    ) -> ParseResult<'a, Self> {
         assert!(bucket_start > 0);
         let (bytes, _num_buckets) = le_u64(&input[bucket_start..])?;
         let (_bytes, mut num_entries) = le_u64(bytes)?;
@@ -126,7 +146,7 @@ impl HashTable {
         version: u64,
         input: &'a [u8],
         mut num_entries: u64,
-    ) -> IResult<&'a [u8], u64> {
+    ) -> ParseResult<'a, u64> {
         let (bytes, num_items_in_bucket) = le_u16(input)?;
         let mut remaining = bytes;
         for _i in 0..num_items_in_bucket {
diff --git a/src/instrumentation_profile/indexed_profile.rs b/src/instrumentation_profile/indexed_profile.rs
@@ -3,8 +3,8 @@ use crate::instrumentation_profile::*;
 use crate::summary::*;
 use anyhow::bail;
 use nom::{
+    error::{ContextError, ErrorKind, ParseError, VerboseError},
     number::{complete::*, Endianness},
-    IResult,
 };
 use std::collections::HashMap;
 use std::convert::TryFrom;
@@ -82,7 +82,7 @@ fn parse_summary<'a>(
     mut input: &'a [u8],
     header: &Header,
     use_cs: bool,
-) -> IResult<&'a [u8], Option<ProfileSummary>> {
+) -> ParseResult<'a, Option<ProfileSummary>> {
     if header.version() >= 4 {
         let (bytes, n_fields) = le_u64(input)?;
         let (bytes, n_entries) = le_u64(bytes)?;
@@ -154,7 +154,7 @@ fn parse_summary<'a>(
 impl InstrProfReader for IndexedInstrProf {
     type Header = Header;
 
-    fn parse_bytes(mut input: &[u8]) -> IResult<&[u8], InstrumentationProfile> {
+    fn parse_bytes(mut input: &[u8]) -> ParseResult<InstrumentationProfile> {
         let (bytes, header) = Self::parse_header(input)?;
         let (bytes, _summary) = parse_summary(bytes, &header, false)?;
         let (bytes, _cs_summary) = if header.is_csir_prof() {
@@ -192,13 +192,20 @@ impl InstrProfReader for IndexedInstrProf {
         Ok((input, profile))
     }
 
-    fn parse_header(input: &[u8]) -> IResult<&[u8], Self::Header> {
+    fn parse_header(input: &[u8]) -> ParseResult<Self::Header> {
         if Self::has_format(input) {
             let (bytes, version) = le_u64(&input[8..])?;
             let (bytes, _) = le_u64(bytes)?;
             let (bytes, hash_type) = le_u64(bytes)?;
+            let hash_type = HashType::try_from(hash_type).map_err(|_e| {
+                let error = VerboseError::from_error_kind(bytes, ErrorKind::Satisfy);
+                nom::Err::Failure(VerboseError::add_context(
+                    bytes,
+                    "invalid enum variant for profile hash",
+                    error,
+                ))
+            })?;
             let (bytes, hash_offset) = le_u64(bytes)?;
-            let hash_type = HashType::try_from(hash_type).expect("BAD ENUM BRUH");
             Ok((
                 bytes,
                 Self::Header {
diff --git a/src/instrumentation_profile/mod.rs b/src/instrumentation_profile/mod.rs
@@ -2,7 +2,7 @@ use crate::instrumentation_profile::indexed_profile::*;
 use crate::instrumentation_profile::raw_profile::*;
 use crate::instrumentation_profile::text_profile::*;
 use crate::instrumentation_profile::types::*;
-use nom::IResult;
+use nom::{error::VerboseError, IResult};
 use std::fs::File;
 use std::io;
 use std::io::prelude::*;
@@ -16,6 +16,8 @@ pub mod summary;
 pub mod text_profile;
 pub mod types;
 
+pub type ParseResult<'a, T> = IResult<&'a [u8], T, VerboseError<&'a [u8]>>;
+
 pub const fn get_num_padding_bytes(len: u64) -> u8 {
     7 & (8 - (len % 8) as u8)
 }
@@ -47,18 +49,19 @@ pub fn parse_bytes(data: &[u8]) -> io::Result<InstrumentationProfile> {
             "Unsupported instrumentation profile format",
         ));
     };
-    nom_res.map(|(_bytes, res)| res).map_err(|e| {
-        println!("Parsing failed: {}", e);
+    nom_res.map(|(_bytes, res)| res).map_err(|_e| {
+        #[cfg(test)]
+        println!("{}", _e);
         io::Error::new(io::ErrorKind::Other, "Parsing failed")
     })
 }
 
 pub trait InstrProfReader {
     type Header;
     /// Parse the profile no lazy parsing here!
-    fn parse_bytes(input: &[u8]) -> IResult<&[u8], InstrumentationProfile>;
+    fn parse_bytes(input: &[u8]) -> ParseResult<InstrumentationProfile>;
     /// Parses a header
-    fn parse_header(input: &[u8]) -> IResult<&[u8], Self::Header>;
+    fn parse_header(input: &[u8]) -> ParseResult<Self::Header>;
     /// Detects that the bytes match the current reader format if it can't read the format it will
     /// return false
     fn has_format(input: impl Read) -> bool;
diff --git a/src/instrumentation_profile/raw_profile.rs b/src/instrumentation_profile/raw_profile.rs
diff --git a/src/instrumentation_profile/text_profile.rs b/src/instrumentation_profile/text_profile.rs
diff --git a/src/util.rs b/src/util.rs
diff --git a/tests/cov.rs b/tests/cov.rs