diff --git a/src/meta/mod.rs b/src/meta/mod.rs index 4a766672..db7bc1d6 100644 --- a/src/meta/mod.rs +++ b/src/meta/mod.rs @@ -56,36 +56,11 @@ //! "claimed". This way, if a crash occurs, the previous fixed metadata will still report `a b //! c d e` as orphans. //! -//! * The metadata is committed again and another transaction comes in. The transaction reclaims 2 -//! pages (`c` and `d`) and orphans 2 pages (`h` and `i`). The state will be recorded as -//! follows: +//! * Subsequent transactions will have a similar effect, with new orphans being appended to the +//! right of the list, and old orphans being reclaimed from the left. //! -//! ```text -//! page list: [ h i c d e f g ] -//! └┬┘ └┬┘ └─┬─┘ -//! orphans│ orphans -//! claimed -//! ``` -//! -//! In words, the new orphans are written in place of the previously claimed pages. Like -//! before, if a crash occurs, the previous fixed metadata will still report `c d e f g` as -//! orphans. -//! -//! * Let's now supposed a transaction reclaims 2 pages (`e` and `f`) and orphans 5 pages (`j`, -//! `k`, `l`, `m`, `n`). This will be the new list: -//! -//! ```text -//! page list: [ h i j k e f g l m n ] -//! └──┬──┘ └┬┘ └──┬──┘ -//! orphans │ orphans -//! claimed -//! ``` -//! -//! In words, orphaned pages are added to the previous claimed section until full, and then -//! appended to the end of the list. -//! -//! To implement this mechanism, two field are required: the size of the page list, and the range -//! of claimed pages. +//! * To prevent the orphan list from growing too big, the list is periodically compactified by +//! overwriting the claimed section of the list with the actual orphans. mod error; @@ -138,25 +113,6 @@ impl Not for SlotIndex { /// * root node information; /// * number of allocated pages; /// * orphan page information. -/// -/// The orphan page information is used to interpret the contents of the `orphan_pages` list. This -/// list may contain pages that are really orphan and unused, and pages that were orphan in the -/// previous snapshot and were later reclaimed. See the [module-level documentation](self) for more -/// information about why `orphan_pages` contains reclaimed pages. -/// -/// The list of reclaimed pages is a contiguous slice inside `orphan_pages` and the meaning of the -/// field in this structure is explained by the following diagram: -/// -/// ```text -/// reclaimed pages -/// ╷ -/// ┌──┴──┐ -/// orphan_pages: [ a b c d e f g h ] -/// │ │ │ -/// │ │ └╴orphan_pages_len -/// │ └╴reclaimed_orphans_end -/// └╴reclaimed_orphans_start -/// ``` #[repr(C)] #[derive(FromBytes, IntoBytes, Immutable, KnownLayout, ByteEq, Clone, Debug)] pub struct MetadataSlot { @@ -170,13 +126,8 @@ pub struct MetadataSlot { page_count: u32, /// Total size of `orphan_pages` (including reclaimed pages) orphan_pages_len: u32, - /// Index of the first reclaimed page inside of `orphan_pages` - reclaimed_orphans_start: u32, - /// Number of reclaimed pages inside of `orphan_pages` - reclaimed_orphans_end: u32, - /// Unused data to allow this structure to be properly aligned. This padding is stored on disk - /// to improve runtime performance - padding: u32, + /// Number of reclaimed pages from the `orphan_pages` list + reclaimed_orphans_len: u32, } impl MetadataSlot { @@ -251,24 +202,16 @@ impl MetadataSlot { #[inline] #[must_use] fn reclaimed_range(&self) -> Range { - self.reclaimed_orphans_start as usize..self.reclaimed_orphans_end as usize + 0..self.reclaimed_orphans_len as usize } /// Returns the range of pages that are actually orphans (not reclaimed) in the `orphan_pages` /// list. - /// - /// This returns 2 disjoint ranges because there may be some reclaimed pages in the middle. #[inline] #[must_use] - fn actual_orphans_ranges(&self) -> (Range, Range) { - debug_assert!( - self.orphan_pages_len >= self.reclaimed_orphans_start + self.reclaimed_orphans_end - ); - ( - 0..(self.reclaimed_orphans_start as usize), - (self.reclaimed_orphans_start as usize + self.reclaimed_orphans_end as usize).. - (self.orphan_pages_len as usize), - ) + fn actual_orphans_range(&self) -> Range { + debug_assert!(self.orphan_pages_len >= self.reclaimed_orphans_len); + self.reclaimed_orphans_len as usize..self.orphan_pages_len as usize } /// Computes the hash for this slot. @@ -310,11 +253,7 @@ impl HashedMetadataSlot { } // Check that the number of reclaimed pages doesn't exceed the total number of orphan // pages. - let reclaimed_orphans_end = self - .reclaimed_orphans_start - .checked_add(self.reclaimed_orphans_end) - .ok_or(CorruptedMetadataError)?; - if self.orphan_pages_len < reclaimed_orphans_end { + if self.orphan_pages_len < self.reclaimed_orphans_len { return Err(CorruptedMetadataError); } // Check the hash. @@ -559,11 +498,43 @@ impl MetadataManager { Ok(()) } + fn compact_orphans(&mut self) { + let (active, dirty, list) = self.parts_mut(); + let reclaimed_range = active.reclaimed_range(); + let actual_orphans_range = dirty.actual_orphans_range(); + + // If orphan page list has observed N pushes and M pops, then first of all note that: + // + // - N == actual_orphans_range.len() + // - M == reclaimed_range.len() + // - N - M = actual_orphans_range.len() + // - N >= M, or equivalently N - M >= 0 + // + // Here we move the orphans to the start of the list when M >= N - M, which implies N <= + // 2M. If that condition is satisfied, we will move N - M items. In total, if the condition + // is satisfied, we will have performed: + // + // - N pushes + // - M pops + // - N - M copies + // + // for a total of N + M + N - M = 2N operations, which means that adding items to the + // orphan page list still takes O(1) amortized time. + if reclaimed_range.len() >= actual_orphans_range.len() { + dirty.orphan_pages_len = actual_orphans_range.len() as u32; + dirty.reclaimed_orphans_len = 0; + list.copy_within(actual_orphans_range, 0); + } + } + /// Saves the metadata to the storage device, and promotes the dirty slot to the active slot. /// /// After calling this method, a new dirty slot is produced with the same contents as the new /// active slot, and an auto-incremented snapshot ID. pub fn commit(&mut self) -> io::Result<()> { + // Compact the orphan page list if there's enough room + self.compact_orphans(); + // First make sure the changes from the dirty slot are on disk self.dirty_slot_mut().update_hash(); debug_assert!(self.dirty_slot_mut().verify_integrity().is_ok()); @@ -622,7 +593,7 @@ impl<'a> OrphanPages<'a> { #[inline] pub fn len(&self) -> usize { let m = self.manager.dirty_slot(); - (m.orphan_pages_len as usize) - (m.reclaimed_orphans_end as usize) + (m.orphan_pages_len as usize) - (m.reclaimed_orphans_len as usize) } /// Maximum number of orphan pages that this list can contain without resizing. @@ -640,8 +611,8 @@ impl<'a> OrphanPages<'a> { /// Returns an iterator that yields the IDs of orphan pages. pub fn iter(&self) -> impl FusedIterator + use<'_> { let list = self.manager.raw_orphan_pages(); - let (left, right) = self.manager.dirty_slot().actual_orphans_ranges(); - list[right].iter().copied().chain(list[left].iter().copied()) + let range = self.manager.dirty_slot().actual_orphans_range(); + list[range].iter().copied() } /// Adds a page to the orphan page list, increasing the capacity of the list if necessary. @@ -656,43 +627,17 @@ impl<'a> OrphanPages<'a> { /// Adds a page to the orphan page list if there is enough capacity. pub fn push_within_capacity(&mut self, orphan: OrphanPage) -> Result<(), OrphanPage> { - // To make sure the previous snapshot is always valid, we cannot modify orphan pages that - // are referenced by the previous snapshot. We can only modify the reclaimed orphans - // slice, or the the additional orphans elements added at the end of the list (if any). - // - // Particular care should be taken because a `pop()` may be followed by a `push()`, and - // it's important that the `push()` does not overwrite data from the previous snapshot. - - let (active, dirty, list) = self.manager.parts_mut(); - - // Check if we can write in the reclaimed slice. We can only write in the intersection - // between the active reclaimed range and the dirty reclaimed range, and only at the bounds - // of the dirty reclaimed range. - let active_reclaimed_range = active.reclaimed_range(); - let dirty_reclaimed_range = dirty.reclaimed_range(); - let intersection = active_reclaimed_range.start.max(dirty_reclaimed_range.start).. - active_reclaimed_range.end.min(dirty_reclaimed_range.end); - - if !intersection.is_empty() { - if intersection.start == dirty_reclaimed_range.start { - list[dirty_reclaimed_range.start] = orphan; - dirty.reclaimed_orphans_start += 1; - dirty.reclaimed_orphans_end -= 1; - return Ok(()); - } else if intersection.end == dirty_reclaimed_range.end { - list[dirty_reclaimed_range.end - 1] = orphan; - dirty.reclaimed_orphans_end -= 1; - return Ok(()); - } - } + let (_, dirty, list) = self.manager.parts_mut(); - // We need to write at the end of the list. We can only write past the active and dirty - // list. - let (_, active_orphans_range) = active.actual_orphans_ranges(); - let (_, dirty_orphans_range) = dirty.actual_orphans_ranges(); - let index = active_orphans_range.end.max(dirty_orphans_range.end); - if index < list.len() { - list[index] = orphan; + let range = dirty.actual_orphans_range(); + if range.end < list.len() { + if range.end > 0 { + // In debug mode, ensure that the sequence of orphan page snapshot IDs are + // non-decreasing. This is because `pop()` makes this assumption, and if this + // assumption is broken, then the orphan page list may grow indefinetely. + debug_assert!(orphan.orphaned_at >= list[range.end - 1].orphaned_at); + } + list[range.end] = orphan; dirty.orphan_pages_len += 1; return Ok(()); } @@ -710,31 +655,12 @@ impl<'a> OrphanPages<'a> { /// exists. pub fn pop(&mut self, snapshot_threshold: SnapshotId) -> Option { let (_, dirty, list) = self.manager.parts_mut(); - let (left, right) = dirty.actual_orphans_ranges(); + let range = dirty.actual_orphans_range(); - // The following code checks the `left` and `right` ranges for orphaned pages that have an - // `orphaned_at()` equal or below `snapshot_threshold`. - // - // Instead of scanning the whole `left` and `right` lists, the code only check the boundary - // elements. The assumption is that snapshot IDs are always increasing, never decreasing, - // and therefore each call to `push()` always adds pages with an increasing - // `orphaned_at()`. So if the first element has an `orphaned_at()` that is already too - // high, there's no point in checking the other elements, because they will also be above - // the threshold. - - if !right.is_empty() { - let orphan = list[right.start]; + if !range.is_empty() { + let orphan = list[range.start]; if orphan.orphaned_at() <= snapshot_threshold { - dirty.reclaimed_orphans_end += 1; - return Some(orphan); - } - } - - if !left.is_empty() { - let orphan = list[left.end - 1]; - if orphan.orphaned_at() <= snapshot_threshold { - dirty.reclaimed_orphans_start -= 1; - dirty.reclaimed_orphans_end += 1; + dirty.reclaimed_orphans_len += 1; return Some(orphan); } } @@ -890,7 +816,7 @@ mod tests { } #[test] - fn push_pop() { + fn random_push_pop() { let f = tempfile::tempfile().expect("failed to open temporary file"); let mut manager = MetadataManager::from_file(f).expect("failed to initialize metadata manager"); @@ -911,6 +837,43 @@ mod tests { } } + #[test] + fn push_pop() { + let f = tempfile::tempfile().expect("failed to open temporary file"); + let mut manager = + MetadataManager::from_file(f).expect("failed to initialize metadata manager"); + + // Add 4 pages with increasing snapshots; the orphan page list will look like this: + // [1, 2, 3, 4] + manager.orphan_pages().push(OrphanPage::new(page_id!(1), 1)).expect("push failed"); + manager.orphan_pages().push(OrphanPage::new(page_id!(2), 2)).expect("push failed"); + manager.orphan_pages().push(OrphanPage::new(page_id!(3), 3)).expect("push failed"); + manager.orphan_pages().push(OrphanPage::new(page_id!(4), 4)).expect("push failed"); + manager.commit().expect("commit failed"); + + // Pop 3 pages; orphan page list: [(claimed), (claimed), (claimed), 4] + manager.orphan_pages().pop(3).expect("pop failed"); + manager.orphan_pages().pop(3).expect("pop failed"); + manager.orphan_pages().pop(3).expect("pop failed"); + manager.commit().expect("commit failed"); + + // Push 2 new pages, again with increasing snapshots; orphan page list: [5, 6, + // (claimed), 4] + manager.orphan_pages().push(OrphanPage::new(page_id!(5), 5)).expect("push failed"); + manager.orphan_pages().push(OrphanPage::new(page_id!(6), 6)).expect("push failed"); + manager.commit().expect("commit failed"); + + // Pop 2 pages; orphan page list: [(claimed), 6, (claimed), (claimed)] + manager.orphan_pages().pop(5).expect("pop failed"); + manager.orphan_pages().pop(5).expect("pop failed"); + manager.commit().expect("commit failed"); + + assert_eq!( + manager.orphan_pages().iter().map(|orphan| orphan.page_id).collect::>(), + [6] + ); + } + #[test] fn crash_recovery() { let f = tempfile::tempfile().expect("failed to open temporary file");