162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Copyright (C) 2008 Oracle. All rights reserved. 462306a36Sopenharmony_ci */ 562306a36Sopenharmony_ci 662306a36Sopenharmony_ci#include <linux/kernel.h> 762306a36Sopenharmony_ci#include <linux/bio.h> 862306a36Sopenharmony_ci#include <linux/file.h> 962306a36Sopenharmony_ci#include <linux/fs.h> 1062306a36Sopenharmony_ci#include <linux/pagemap.h> 1162306a36Sopenharmony_ci#include <linux/pagevec.h> 1262306a36Sopenharmony_ci#include <linux/highmem.h> 1362306a36Sopenharmony_ci#include <linux/kthread.h> 1462306a36Sopenharmony_ci#include <linux/time.h> 1562306a36Sopenharmony_ci#include <linux/init.h> 1662306a36Sopenharmony_ci#include <linux/string.h> 1762306a36Sopenharmony_ci#include <linux/backing-dev.h> 1862306a36Sopenharmony_ci#include <linux/writeback.h> 1962306a36Sopenharmony_ci#include <linux/psi.h> 2062306a36Sopenharmony_ci#include <linux/slab.h> 2162306a36Sopenharmony_ci#include <linux/sched/mm.h> 2262306a36Sopenharmony_ci#include <linux/log2.h> 2362306a36Sopenharmony_ci#include <crypto/hash.h> 2462306a36Sopenharmony_ci#include "misc.h" 2562306a36Sopenharmony_ci#include "ctree.h" 2662306a36Sopenharmony_ci#include "fs.h" 2762306a36Sopenharmony_ci#include "disk-io.h" 2862306a36Sopenharmony_ci#include "transaction.h" 2962306a36Sopenharmony_ci#include "btrfs_inode.h" 3062306a36Sopenharmony_ci#include "bio.h" 3162306a36Sopenharmony_ci#include "ordered-data.h" 3262306a36Sopenharmony_ci#include "compression.h" 3362306a36Sopenharmony_ci#include "extent_io.h" 3462306a36Sopenharmony_ci#include "extent_map.h" 3562306a36Sopenharmony_ci#include "subpage.h" 3662306a36Sopenharmony_ci#include "zoned.h" 3762306a36Sopenharmony_ci#include "file-item.h" 3862306a36Sopenharmony_ci#include "super.h" 3962306a36Sopenharmony_ci 4062306a36Sopenharmony_cistatic struct bio_set btrfs_compressed_bioset; 4162306a36Sopenharmony_ci 4262306a36Sopenharmony_cistatic const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; 4362306a36Sopenharmony_ci 4462306a36Sopenharmony_ciconst char* btrfs_compress_type2str(enum btrfs_compression_type type) 4562306a36Sopenharmony_ci{ 4662306a36Sopenharmony_ci switch (type) { 4762306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: 4862306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: 4962306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: 5062306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: 5162306a36Sopenharmony_ci return btrfs_compress_types[type]; 5262306a36Sopenharmony_ci default: 5362306a36Sopenharmony_ci break; 5462306a36Sopenharmony_ci } 5562306a36Sopenharmony_ci 5662306a36Sopenharmony_ci return NULL; 5762306a36Sopenharmony_ci} 5862306a36Sopenharmony_ci 5962306a36Sopenharmony_cistatic inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) 6062306a36Sopenharmony_ci{ 6162306a36Sopenharmony_ci return container_of(bbio, struct compressed_bio, bbio); 6262306a36Sopenharmony_ci} 6362306a36Sopenharmony_ci 6462306a36Sopenharmony_cistatic struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, 6562306a36Sopenharmony_ci u64 start, blk_opf_t op, 6662306a36Sopenharmony_ci btrfs_bio_end_io_t end_io) 6762306a36Sopenharmony_ci{ 6862306a36Sopenharmony_ci struct btrfs_bio *bbio; 6962306a36Sopenharmony_ci 7062306a36Sopenharmony_ci bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, 7162306a36Sopenharmony_ci GFP_NOFS, &btrfs_compressed_bioset)); 7262306a36Sopenharmony_ci btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); 7362306a36Sopenharmony_ci bbio->inode = inode; 7462306a36Sopenharmony_ci bbio->file_offset = start; 7562306a36Sopenharmony_ci return to_compressed_bio(bbio); 7662306a36Sopenharmony_ci} 7762306a36Sopenharmony_ci 7862306a36Sopenharmony_cibool btrfs_compress_is_valid_type(const char *str, size_t len) 7962306a36Sopenharmony_ci{ 8062306a36Sopenharmony_ci int i; 8162306a36Sopenharmony_ci 8262306a36Sopenharmony_ci for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { 8362306a36Sopenharmony_ci size_t comp_len = strlen(btrfs_compress_types[i]); 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci if (len < comp_len) 8662306a36Sopenharmony_ci continue; 8762306a36Sopenharmony_ci 8862306a36Sopenharmony_ci if (!strncmp(btrfs_compress_types[i], str, comp_len)) 8962306a36Sopenharmony_ci return true; 9062306a36Sopenharmony_ci } 9162306a36Sopenharmony_ci return false; 9262306a36Sopenharmony_ci} 9362306a36Sopenharmony_ci 9462306a36Sopenharmony_cistatic int compression_compress_pages(int type, struct list_head *ws, 9562306a36Sopenharmony_ci struct address_space *mapping, u64 start, struct page **pages, 9662306a36Sopenharmony_ci unsigned long *out_pages, unsigned long *total_in, 9762306a36Sopenharmony_ci unsigned long *total_out) 9862306a36Sopenharmony_ci{ 9962306a36Sopenharmony_ci switch (type) { 10062306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: 10162306a36Sopenharmony_ci return zlib_compress_pages(ws, mapping, start, pages, 10262306a36Sopenharmony_ci out_pages, total_in, total_out); 10362306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: 10462306a36Sopenharmony_ci return lzo_compress_pages(ws, mapping, start, pages, 10562306a36Sopenharmony_ci out_pages, total_in, total_out); 10662306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: 10762306a36Sopenharmony_ci return zstd_compress_pages(ws, mapping, start, pages, 10862306a36Sopenharmony_ci out_pages, total_in, total_out); 10962306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: 11062306a36Sopenharmony_ci default: 11162306a36Sopenharmony_ci /* 11262306a36Sopenharmony_ci * This can happen when compression races with remount setting 11362306a36Sopenharmony_ci * it to 'no compress', while caller doesn't call 11462306a36Sopenharmony_ci * inode_need_compress() to check if we really need to 11562306a36Sopenharmony_ci * compress. 11662306a36Sopenharmony_ci * 11762306a36Sopenharmony_ci * Not a big deal, just need to inform caller that we 11862306a36Sopenharmony_ci * haven't allocated any pages yet. 11962306a36Sopenharmony_ci */ 12062306a36Sopenharmony_ci *out_pages = 0; 12162306a36Sopenharmony_ci return -E2BIG; 12262306a36Sopenharmony_ci } 12362306a36Sopenharmony_ci} 12462306a36Sopenharmony_ci 12562306a36Sopenharmony_cistatic int compression_decompress_bio(struct list_head *ws, 12662306a36Sopenharmony_ci struct compressed_bio *cb) 12762306a36Sopenharmony_ci{ 12862306a36Sopenharmony_ci switch (cb->compress_type) { 12962306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb); 13062306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb); 13162306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb); 13262306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: 13362306a36Sopenharmony_ci default: 13462306a36Sopenharmony_ci /* 13562306a36Sopenharmony_ci * This can't happen, the type is validated several times 13662306a36Sopenharmony_ci * before we get here. 13762306a36Sopenharmony_ci */ 13862306a36Sopenharmony_ci BUG(); 13962306a36Sopenharmony_ci } 14062306a36Sopenharmony_ci} 14162306a36Sopenharmony_ci 14262306a36Sopenharmony_cistatic int compression_decompress(int type, struct list_head *ws, 14362306a36Sopenharmony_ci const u8 *data_in, struct page *dest_page, 14462306a36Sopenharmony_ci unsigned long start_byte, size_t srclen, size_t destlen) 14562306a36Sopenharmony_ci{ 14662306a36Sopenharmony_ci switch (type) { 14762306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page, 14862306a36Sopenharmony_ci start_byte, srclen, destlen); 14962306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page, 15062306a36Sopenharmony_ci start_byte, srclen, destlen); 15162306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page, 15262306a36Sopenharmony_ci start_byte, srclen, destlen); 15362306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: 15462306a36Sopenharmony_ci default: 15562306a36Sopenharmony_ci /* 15662306a36Sopenharmony_ci * This can't happen, the type is validated several times 15762306a36Sopenharmony_ci * before we get here. 15862306a36Sopenharmony_ci */ 15962306a36Sopenharmony_ci BUG(); 16062306a36Sopenharmony_ci } 16162306a36Sopenharmony_ci} 16262306a36Sopenharmony_ci 16362306a36Sopenharmony_cistatic void btrfs_free_compressed_pages(struct compressed_bio *cb) 16462306a36Sopenharmony_ci{ 16562306a36Sopenharmony_ci for (unsigned int i = 0; i < cb->nr_pages; i++) 16662306a36Sopenharmony_ci put_page(cb->compressed_pages[i]); 16762306a36Sopenharmony_ci kfree(cb->compressed_pages); 16862306a36Sopenharmony_ci} 16962306a36Sopenharmony_ci 17062306a36Sopenharmony_cistatic int btrfs_decompress_bio(struct compressed_bio *cb); 17162306a36Sopenharmony_ci 17262306a36Sopenharmony_cistatic void end_compressed_bio_read(struct btrfs_bio *bbio) 17362306a36Sopenharmony_ci{ 17462306a36Sopenharmony_ci struct compressed_bio *cb = to_compressed_bio(bbio); 17562306a36Sopenharmony_ci blk_status_t status = bbio->bio.bi_status; 17662306a36Sopenharmony_ci 17762306a36Sopenharmony_ci if (!status) 17862306a36Sopenharmony_ci status = errno_to_blk_status(btrfs_decompress_bio(cb)); 17962306a36Sopenharmony_ci 18062306a36Sopenharmony_ci btrfs_free_compressed_pages(cb); 18162306a36Sopenharmony_ci btrfs_bio_end_io(cb->orig_bbio, status); 18262306a36Sopenharmony_ci bio_put(&bbio->bio); 18362306a36Sopenharmony_ci} 18462306a36Sopenharmony_ci 18562306a36Sopenharmony_ci/* 18662306a36Sopenharmony_ci * Clear the writeback bits on all of the file 18762306a36Sopenharmony_ci * pages for a compressed write 18862306a36Sopenharmony_ci */ 18962306a36Sopenharmony_cistatic noinline void end_compressed_writeback(const struct compressed_bio *cb) 19062306a36Sopenharmony_ci{ 19162306a36Sopenharmony_ci struct inode *inode = &cb->bbio.inode->vfs_inode; 19262306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 19362306a36Sopenharmony_ci unsigned long index = cb->start >> PAGE_SHIFT; 19462306a36Sopenharmony_ci unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; 19562306a36Sopenharmony_ci struct folio_batch fbatch; 19662306a36Sopenharmony_ci const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); 19762306a36Sopenharmony_ci int i; 19862306a36Sopenharmony_ci int ret; 19962306a36Sopenharmony_ci 20062306a36Sopenharmony_ci if (errno) 20162306a36Sopenharmony_ci mapping_set_error(inode->i_mapping, errno); 20262306a36Sopenharmony_ci 20362306a36Sopenharmony_ci folio_batch_init(&fbatch); 20462306a36Sopenharmony_ci while (index <= end_index) { 20562306a36Sopenharmony_ci ret = filemap_get_folios(inode->i_mapping, &index, end_index, 20662306a36Sopenharmony_ci &fbatch); 20762306a36Sopenharmony_ci 20862306a36Sopenharmony_ci if (ret == 0) 20962306a36Sopenharmony_ci return; 21062306a36Sopenharmony_ci 21162306a36Sopenharmony_ci for (i = 0; i < ret; i++) { 21262306a36Sopenharmony_ci struct folio *folio = fbatch.folios[i]; 21362306a36Sopenharmony_ci 21462306a36Sopenharmony_ci btrfs_page_clamp_clear_writeback(fs_info, &folio->page, 21562306a36Sopenharmony_ci cb->start, cb->len); 21662306a36Sopenharmony_ci } 21762306a36Sopenharmony_ci folio_batch_release(&fbatch); 21862306a36Sopenharmony_ci } 21962306a36Sopenharmony_ci /* the inode may be gone now */ 22062306a36Sopenharmony_ci} 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_cistatic void btrfs_finish_compressed_write_work(struct work_struct *work) 22362306a36Sopenharmony_ci{ 22462306a36Sopenharmony_ci struct compressed_bio *cb = 22562306a36Sopenharmony_ci container_of(work, struct compressed_bio, write_end_work); 22662306a36Sopenharmony_ci 22762306a36Sopenharmony_ci btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, 22862306a36Sopenharmony_ci cb->bbio.bio.bi_status == BLK_STS_OK); 22962306a36Sopenharmony_ci 23062306a36Sopenharmony_ci if (cb->writeback) 23162306a36Sopenharmony_ci end_compressed_writeback(cb); 23262306a36Sopenharmony_ci /* Note, our inode could be gone now */ 23362306a36Sopenharmony_ci 23462306a36Sopenharmony_ci btrfs_free_compressed_pages(cb); 23562306a36Sopenharmony_ci bio_put(&cb->bbio.bio); 23662306a36Sopenharmony_ci} 23762306a36Sopenharmony_ci 23862306a36Sopenharmony_ci/* 23962306a36Sopenharmony_ci * Do the cleanup once all the compressed pages hit the disk. This will clear 24062306a36Sopenharmony_ci * writeback on the file pages and free the compressed pages. 24162306a36Sopenharmony_ci * 24262306a36Sopenharmony_ci * This also calls the writeback end hooks for the file pages so that metadata 24362306a36Sopenharmony_ci * and checksums can be updated in the file. 24462306a36Sopenharmony_ci */ 24562306a36Sopenharmony_cistatic void end_compressed_bio_write(struct btrfs_bio *bbio) 24662306a36Sopenharmony_ci{ 24762306a36Sopenharmony_ci struct compressed_bio *cb = to_compressed_bio(bbio); 24862306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; 24962306a36Sopenharmony_ci 25062306a36Sopenharmony_ci queue_work(fs_info->compressed_write_workers, &cb->write_end_work); 25162306a36Sopenharmony_ci} 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_cistatic void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) 25462306a36Sopenharmony_ci{ 25562306a36Sopenharmony_ci struct bio *bio = &cb->bbio.bio; 25662306a36Sopenharmony_ci u32 offset = 0; 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_ci while (offset < cb->compressed_len) { 25962306a36Sopenharmony_ci u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); 26062306a36Sopenharmony_ci 26162306a36Sopenharmony_ci /* Maximum compressed extent is smaller than bio size limit. */ 26262306a36Sopenharmony_ci __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], 26362306a36Sopenharmony_ci len, 0); 26462306a36Sopenharmony_ci offset += len; 26562306a36Sopenharmony_ci } 26662306a36Sopenharmony_ci} 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_ci/* 26962306a36Sopenharmony_ci * worker function to build and submit bios for previously compressed pages. 27062306a36Sopenharmony_ci * The corresponding pages in the inode should be marked for writeback 27162306a36Sopenharmony_ci * and the compressed pages should have a reference on them for dropping 27262306a36Sopenharmony_ci * when the IO is complete. 27362306a36Sopenharmony_ci * 27462306a36Sopenharmony_ci * This also checksums the file bytes and gets things ready for 27562306a36Sopenharmony_ci * the end io hooks. 27662306a36Sopenharmony_ci */ 27762306a36Sopenharmony_civoid btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, 27862306a36Sopenharmony_ci struct page **compressed_pages, 27962306a36Sopenharmony_ci unsigned int nr_pages, 28062306a36Sopenharmony_ci blk_opf_t write_flags, 28162306a36Sopenharmony_ci bool writeback) 28262306a36Sopenharmony_ci{ 28362306a36Sopenharmony_ci struct btrfs_inode *inode = BTRFS_I(ordered->inode); 28462306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 28562306a36Sopenharmony_ci struct compressed_bio *cb; 28662306a36Sopenharmony_ci 28762306a36Sopenharmony_ci ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); 28862306a36Sopenharmony_ci ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); 28962306a36Sopenharmony_ci 29062306a36Sopenharmony_ci cb = alloc_compressed_bio(inode, ordered->file_offset, 29162306a36Sopenharmony_ci REQ_OP_WRITE | write_flags, 29262306a36Sopenharmony_ci end_compressed_bio_write); 29362306a36Sopenharmony_ci cb->start = ordered->file_offset; 29462306a36Sopenharmony_ci cb->len = ordered->num_bytes; 29562306a36Sopenharmony_ci cb->compressed_pages = compressed_pages; 29662306a36Sopenharmony_ci cb->compressed_len = ordered->disk_num_bytes; 29762306a36Sopenharmony_ci cb->writeback = writeback; 29862306a36Sopenharmony_ci INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); 29962306a36Sopenharmony_ci cb->nr_pages = nr_pages; 30062306a36Sopenharmony_ci cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; 30162306a36Sopenharmony_ci cb->bbio.ordered = ordered; 30262306a36Sopenharmony_ci btrfs_add_compressed_bio_pages(cb); 30362306a36Sopenharmony_ci 30462306a36Sopenharmony_ci btrfs_submit_bio(&cb->bbio, 0); 30562306a36Sopenharmony_ci} 30662306a36Sopenharmony_ci 30762306a36Sopenharmony_ci/* 30862306a36Sopenharmony_ci * Add extra pages in the same compressed file extent so that we don't need to 30962306a36Sopenharmony_ci * re-read the same extent again and again. 31062306a36Sopenharmony_ci * 31162306a36Sopenharmony_ci * NOTE: this won't work well for subpage, as for subpage read, we lock the 31262306a36Sopenharmony_ci * full page then submit bio for each compressed/regular extents. 31362306a36Sopenharmony_ci * 31462306a36Sopenharmony_ci * This means, if we have several sectors in the same page points to the same 31562306a36Sopenharmony_ci * on-disk compressed data, we will re-read the same extent many times and 31662306a36Sopenharmony_ci * this function can only help for the next page. 31762306a36Sopenharmony_ci */ 31862306a36Sopenharmony_cistatic noinline int add_ra_bio_pages(struct inode *inode, 31962306a36Sopenharmony_ci u64 compressed_end, 32062306a36Sopenharmony_ci struct compressed_bio *cb, 32162306a36Sopenharmony_ci int *memstall, unsigned long *pflags) 32262306a36Sopenharmony_ci{ 32362306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 32462306a36Sopenharmony_ci unsigned long end_index; 32562306a36Sopenharmony_ci struct bio *orig_bio = &cb->orig_bbio->bio; 32662306a36Sopenharmony_ci u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; 32762306a36Sopenharmony_ci u64 isize = i_size_read(inode); 32862306a36Sopenharmony_ci int ret; 32962306a36Sopenharmony_ci struct page *page; 33062306a36Sopenharmony_ci struct extent_map *em; 33162306a36Sopenharmony_ci struct address_space *mapping = inode->i_mapping; 33262306a36Sopenharmony_ci struct extent_map_tree *em_tree; 33362306a36Sopenharmony_ci struct extent_io_tree *tree; 33462306a36Sopenharmony_ci int sectors_missed = 0; 33562306a36Sopenharmony_ci 33662306a36Sopenharmony_ci em_tree = &BTRFS_I(inode)->extent_tree; 33762306a36Sopenharmony_ci tree = &BTRFS_I(inode)->io_tree; 33862306a36Sopenharmony_ci 33962306a36Sopenharmony_ci if (isize == 0) 34062306a36Sopenharmony_ci return 0; 34162306a36Sopenharmony_ci 34262306a36Sopenharmony_ci /* 34362306a36Sopenharmony_ci * For current subpage support, we only support 64K page size, 34462306a36Sopenharmony_ci * which means maximum compressed extent size (128K) is just 2x page 34562306a36Sopenharmony_ci * size. 34662306a36Sopenharmony_ci * This makes readahead less effective, so here disable readahead for 34762306a36Sopenharmony_ci * subpage for now, until full compressed write is supported. 34862306a36Sopenharmony_ci */ 34962306a36Sopenharmony_ci if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) 35062306a36Sopenharmony_ci return 0; 35162306a36Sopenharmony_ci 35262306a36Sopenharmony_ci end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 35362306a36Sopenharmony_ci 35462306a36Sopenharmony_ci while (cur < compressed_end) { 35562306a36Sopenharmony_ci u64 page_end; 35662306a36Sopenharmony_ci u64 pg_index = cur >> PAGE_SHIFT; 35762306a36Sopenharmony_ci u32 add_size; 35862306a36Sopenharmony_ci 35962306a36Sopenharmony_ci if (pg_index > end_index) 36062306a36Sopenharmony_ci break; 36162306a36Sopenharmony_ci 36262306a36Sopenharmony_ci page = xa_load(&mapping->i_pages, pg_index); 36362306a36Sopenharmony_ci if (page && !xa_is_value(page)) { 36462306a36Sopenharmony_ci sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> 36562306a36Sopenharmony_ci fs_info->sectorsize_bits; 36662306a36Sopenharmony_ci 36762306a36Sopenharmony_ci /* Beyond threshold, no need to continue */ 36862306a36Sopenharmony_ci if (sectors_missed > 4) 36962306a36Sopenharmony_ci break; 37062306a36Sopenharmony_ci 37162306a36Sopenharmony_ci /* 37262306a36Sopenharmony_ci * Jump to next page start as we already have page for 37362306a36Sopenharmony_ci * current offset. 37462306a36Sopenharmony_ci */ 37562306a36Sopenharmony_ci cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; 37662306a36Sopenharmony_ci continue; 37762306a36Sopenharmony_ci } 37862306a36Sopenharmony_ci 37962306a36Sopenharmony_ci page = __page_cache_alloc(mapping_gfp_constraint(mapping, 38062306a36Sopenharmony_ci ~__GFP_FS)); 38162306a36Sopenharmony_ci if (!page) 38262306a36Sopenharmony_ci break; 38362306a36Sopenharmony_ci 38462306a36Sopenharmony_ci if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { 38562306a36Sopenharmony_ci put_page(page); 38662306a36Sopenharmony_ci /* There is already a page, skip to page end */ 38762306a36Sopenharmony_ci cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; 38862306a36Sopenharmony_ci continue; 38962306a36Sopenharmony_ci } 39062306a36Sopenharmony_ci 39162306a36Sopenharmony_ci if (!*memstall && PageWorkingset(page)) { 39262306a36Sopenharmony_ci psi_memstall_enter(pflags); 39362306a36Sopenharmony_ci *memstall = 1; 39462306a36Sopenharmony_ci } 39562306a36Sopenharmony_ci 39662306a36Sopenharmony_ci ret = set_page_extent_mapped(page); 39762306a36Sopenharmony_ci if (ret < 0) { 39862306a36Sopenharmony_ci unlock_page(page); 39962306a36Sopenharmony_ci put_page(page); 40062306a36Sopenharmony_ci break; 40162306a36Sopenharmony_ci } 40262306a36Sopenharmony_ci 40362306a36Sopenharmony_ci page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; 40462306a36Sopenharmony_ci lock_extent(tree, cur, page_end, NULL); 40562306a36Sopenharmony_ci read_lock(&em_tree->lock); 40662306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); 40762306a36Sopenharmony_ci read_unlock(&em_tree->lock); 40862306a36Sopenharmony_ci 40962306a36Sopenharmony_ci /* 41062306a36Sopenharmony_ci * At this point, we have a locked page in the page cache for 41162306a36Sopenharmony_ci * these bytes in the file. But, we have to make sure they map 41262306a36Sopenharmony_ci * to this compressed extent on disk. 41362306a36Sopenharmony_ci */ 41462306a36Sopenharmony_ci if (!em || cur < em->start || 41562306a36Sopenharmony_ci (cur + fs_info->sectorsize > extent_map_end(em)) || 41662306a36Sopenharmony_ci (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { 41762306a36Sopenharmony_ci free_extent_map(em); 41862306a36Sopenharmony_ci unlock_extent(tree, cur, page_end, NULL); 41962306a36Sopenharmony_ci unlock_page(page); 42062306a36Sopenharmony_ci put_page(page); 42162306a36Sopenharmony_ci break; 42262306a36Sopenharmony_ci } 42362306a36Sopenharmony_ci free_extent_map(em); 42462306a36Sopenharmony_ci 42562306a36Sopenharmony_ci if (page->index == end_index) { 42662306a36Sopenharmony_ci size_t zero_offset = offset_in_page(isize); 42762306a36Sopenharmony_ci 42862306a36Sopenharmony_ci if (zero_offset) { 42962306a36Sopenharmony_ci int zeros; 43062306a36Sopenharmony_ci zeros = PAGE_SIZE - zero_offset; 43162306a36Sopenharmony_ci memzero_page(page, zero_offset, zeros); 43262306a36Sopenharmony_ci } 43362306a36Sopenharmony_ci } 43462306a36Sopenharmony_ci 43562306a36Sopenharmony_ci add_size = min(em->start + em->len, page_end + 1) - cur; 43662306a36Sopenharmony_ci ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); 43762306a36Sopenharmony_ci if (ret != add_size) { 43862306a36Sopenharmony_ci unlock_extent(tree, cur, page_end, NULL); 43962306a36Sopenharmony_ci unlock_page(page); 44062306a36Sopenharmony_ci put_page(page); 44162306a36Sopenharmony_ci break; 44262306a36Sopenharmony_ci } 44362306a36Sopenharmony_ci /* 44462306a36Sopenharmony_ci * If it's subpage, we also need to increase its 44562306a36Sopenharmony_ci * subpage::readers number, as at endio we will decrease 44662306a36Sopenharmony_ci * subpage::readers and to unlock the page. 44762306a36Sopenharmony_ci */ 44862306a36Sopenharmony_ci if (fs_info->sectorsize < PAGE_SIZE) 44962306a36Sopenharmony_ci btrfs_subpage_start_reader(fs_info, page, cur, add_size); 45062306a36Sopenharmony_ci put_page(page); 45162306a36Sopenharmony_ci cur += add_size; 45262306a36Sopenharmony_ci } 45362306a36Sopenharmony_ci return 0; 45462306a36Sopenharmony_ci} 45562306a36Sopenharmony_ci 45662306a36Sopenharmony_ci/* 45762306a36Sopenharmony_ci * for a compressed read, the bio we get passed has all the inode pages 45862306a36Sopenharmony_ci * in it. We don't actually do IO on those pages but allocate new ones 45962306a36Sopenharmony_ci * to hold the compressed pages on disk. 46062306a36Sopenharmony_ci * 46162306a36Sopenharmony_ci * bio->bi_iter.bi_sector points to the compressed extent on disk 46262306a36Sopenharmony_ci * bio->bi_io_vec points to all of the inode pages 46362306a36Sopenharmony_ci * 46462306a36Sopenharmony_ci * After the compressed pages are read, we copy the bytes into the 46562306a36Sopenharmony_ci * bio we were passed and then call the bio end_io calls 46662306a36Sopenharmony_ci */ 46762306a36Sopenharmony_civoid btrfs_submit_compressed_read(struct btrfs_bio *bbio) 46862306a36Sopenharmony_ci{ 46962306a36Sopenharmony_ci struct btrfs_inode *inode = bbio->inode; 47062306a36Sopenharmony_ci struct btrfs_fs_info *fs_info = inode->root->fs_info; 47162306a36Sopenharmony_ci struct extent_map_tree *em_tree = &inode->extent_tree; 47262306a36Sopenharmony_ci struct compressed_bio *cb; 47362306a36Sopenharmony_ci unsigned int compressed_len; 47462306a36Sopenharmony_ci u64 file_offset = bbio->file_offset; 47562306a36Sopenharmony_ci u64 em_len; 47662306a36Sopenharmony_ci u64 em_start; 47762306a36Sopenharmony_ci struct extent_map *em; 47862306a36Sopenharmony_ci unsigned long pflags; 47962306a36Sopenharmony_ci int memstall = 0; 48062306a36Sopenharmony_ci blk_status_t ret; 48162306a36Sopenharmony_ci int ret2; 48262306a36Sopenharmony_ci 48362306a36Sopenharmony_ci /* we need the actual starting offset of this extent in the file */ 48462306a36Sopenharmony_ci read_lock(&em_tree->lock); 48562306a36Sopenharmony_ci em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); 48662306a36Sopenharmony_ci read_unlock(&em_tree->lock); 48762306a36Sopenharmony_ci if (!em) { 48862306a36Sopenharmony_ci ret = BLK_STS_IOERR; 48962306a36Sopenharmony_ci goto out; 49062306a36Sopenharmony_ci } 49162306a36Sopenharmony_ci 49262306a36Sopenharmony_ci ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); 49362306a36Sopenharmony_ci compressed_len = em->block_len; 49462306a36Sopenharmony_ci 49562306a36Sopenharmony_ci cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, 49662306a36Sopenharmony_ci end_compressed_bio_read); 49762306a36Sopenharmony_ci 49862306a36Sopenharmony_ci cb->start = em->orig_start; 49962306a36Sopenharmony_ci em_len = em->len; 50062306a36Sopenharmony_ci em_start = em->start; 50162306a36Sopenharmony_ci 50262306a36Sopenharmony_ci cb->len = bbio->bio.bi_iter.bi_size; 50362306a36Sopenharmony_ci cb->compressed_len = compressed_len; 50462306a36Sopenharmony_ci cb->compress_type = em->compress_type; 50562306a36Sopenharmony_ci cb->orig_bbio = bbio; 50662306a36Sopenharmony_ci 50762306a36Sopenharmony_ci free_extent_map(em); 50862306a36Sopenharmony_ci 50962306a36Sopenharmony_ci cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); 51062306a36Sopenharmony_ci cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); 51162306a36Sopenharmony_ci if (!cb->compressed_pages) { 51262306a36Sopenharmony_ci ret = BLK_STS_RESOURCE; 51362306a36Sopenharmony_ci goto out_free_bio; 51462306a36Sopenharmony_ci } 51562306a36Sopenharmony_ci 51662306a36Sopenharmony_ci ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); 51762306a36Sopenharmony_ci if (ret2) { 51862306a36Sopenharmony_ci ret = BLK_STS_RESOURCE; 51962306a36Sopenharmony_ci goto out_free_compressed_pages; 52062306a36Sopenharmony_ci } 52162306a36Sopenharmony_ci 52262306a36Sopenharmony_ci add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, 52362306a36Sopenharmony_ci &pflags); 52462306a36Sopenharmony_ci 52562306a36Sopenharmony_ci /* include any pages we added in add_ra-bio_pages */ 52662306a36Sopenharmony_ci cb->len = bbio->bio.bi_iter.bi_size; 52762306a36Sopenharmony_ci cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; 52862306a36Sopenharmony_ci btrfs_add_compressed_bio_pages(cb); 52962306a36Sopenharmony_ci 53062306a36Sopenharmony_ci if (memstall) 53162306a36Sopenharmony_ci psi_memstall_leave(&pflags); 53262306a36Sopenharmony_ci 53362306a36Sopenharmony_ci btrfs_submit_bio(&cb->bbio, 0); 53462306a36Sopenharmony_ci return; 53562306a36Sopenharmony_ci 53662306a36Sopenharmony_ciout_free_compressed_pages: 53762306a36Sopenharmony_ci kfree(cb->compressed_pages); 53862306a36Sopenharmony_ciout_free_bio: 53962306a36Sopenharmony_ci bio_put(&cb->bbio.bio); 54062306a36Sopenharmony_ciout: 54162306a36Sopenharmony_ci btrfs_bio_end_io(bbio, ret); 54262306a36Sopenharmony_ci} 54362306a36Sopenharmony_ci 54462306a36Sopenharmony_ci/* 54562306a36Sopenharmony_ci * Heuristic uses systematic sampling to collect data from the input data 54662306a36Sopenharmony_ci * range, the logic can be tuned by the following constants: 54762306a36Sopenharmony_ci * 54862306a36Sopenharmony_ci * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample 54962306a36Sopenharmony_ci * @SAMPLING_INTERVAL - range from which the sampled data can be collected 55062306a36Sopenharmony_ci */ 55162306a36Sopenharmony_ci#define SAMPLING_READ_SIZE (16) 55262306a36Sopenharmony_ci#define SAMPLING_INTERVAL (256) 55362306a36Sopenharmony_ci 55462306a36Sopenharmony_ci/* 55562306a36Sopenharmony_ci * For statistical analysis of the input data we consider bytes that form a 55662306a36Sopenharmony_ci * Galois Field of 256 objects. Each object has an attribute count, ie. how 55762306a36Sopenharmony_ci * many times the object appeared in the sample. 55862306a36Sopenharmony_ci */ 55962306a36Sopenharmony_ci#define BUCKET_SIZE (256) 56062306a36Sopenharmony_ci 56162306a36Sopenharmony_ci/* 56262306a36Sopenharmony_ci * The size of the sample is based on a statistical sampling rule of thumb. 56362306a36Sopenharmony_ci * The common way is to perform sampling tests as long as the number of 56462306a36Sopenharmony_ci * elements in each cell is at least 5. 56562306a36Sopenharmony_ci * 56662306a36Sopenharmony_ci * Instead of 5, we choose 32 to obtain more accurate results. 56762306a36Sopenharmony_ci * If the data contain the maximum number of symbols, which is 256, we obtain a 56862306a36Sopenharmony_ci * sample size bound by 8192. 56962306a36Sopenharmony_ci * 57062306a36Sopenharmony_ci * For a sample of at most 8KB of data per data range: 16 consecutive bytes 57162306a36Sopenharmony_ci * from up to 512 locations. 57262306a36Sopenharmony_ci */ 57362306a36Sopenharmony_ci#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ 57462306a36Sopenharmony_ci SAMPLING_READ_SIZE / SAMPLING_INTERVAL) 57562306a36Sopenharmony_ci 57662306a36Sopenharmony_cistruct bucket_item { 57762306a36Sopenharmony_ci u32 count; 57862306a36Sopenharmony_ci}; 57962306a36Sopenharmony_ci 58062306a36Sopenharmony_cistruct heuristic_ws { 58162306a36Sopenharmony_ci /* Partial copy of input data */ 58262306a36Sopenharmony_ci u8 *sample; 58362306a36Sopenharmony_ci u32 sample_size; 58462306a36Sopenharmony_ci /* Buckets store counters for each byte value */ 58562306a36Sopenharmony_ci struct bucket_item *bucket; 58662306a36Sopenharmony_ci /* Sorting buffer */ 58762306a36Sopenharmony_ci struct bucket_item *bucket_b; 58862306a36Sopenharmony_ci struct list_head list; 58962306a36Sopenharmony_ci}; 59062306a36Sopenharmony_ci 59162306a36Sopenharmony_cistatic struct workspace_manager heuristic_wsm; 59262306a36Sopenharmony_ci 59362306a36Sopenharmony_cistatic void free_heuristic_ws(struct list_head *ws) 59462306a36Sopenharmony_ci{ 59562306a36Sopenharmony_ci struct heuristic_ws *workspace; 59662306a36Sopenharmony_ci 59762306a36Sopenharmony_ci workspace = list_entry(ws, struct heuristic_ws, list); 59862306a36Sopenharmony_ci 59962306a36Sopenharmony_ci kvfree(workspace->sample); 60062306a36Sopenharmony_ci kfree(workspace->bucket); 60162306a36Sopenharmony_ci kfree(workspace->bucket_b); 60262306a36Sopenharmony_ci kfree(workspace); 60362306a36Sopenharmony_ci} 60462306a36Sopenharmony_ci 60562306a36Sopenharmony_cistatic struct list_head *alloc_heuristic_ws(unsigned int level) 60662306a36Sopenharmony_ci{ 60762306a36Sopenharmony_ci struct heuristic_ws *ws; 60862306a36Sopenharmony_ci 60962306a36Sopenharmony_ci ws = kzalloc(sizeof(*ws), GFP_KERNEL); 61062306a36Sopenharmony_ci if (!ws) 61162306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 61262306a36Sopenharmony_ci 61362306a36Sopenharmony_ci ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); 61462306a36Sopenharmony_ci if (!ws->sample) 61562306a36Sopenharmony_ci goto fail; 61662306a36Sopenharmony_ci 61762306a36Sopenharmony_ci ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); 61862306a36Sopenharmony_ci if (!ws->bucket) 61962306a36Sopenharmony_ci goto fail; 62062306a36Sopenharmony_ci 62162306a36Sopenharmony_ci ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL); 62262306a36Sopenharmony_ci if (!ws->bucket_b) 62362306a36Sopenharmony_ci goto fail; 62462306a36Sopenharmony_ci 62562306a36Sopenharmony_ci INIT_LIST_HEAD(&ws->list); 62662306a36Sopenharmony_ci return &ws->list; 62762306a36Sopenharmony_cifail: 62862306a36Sopenharmony_ci free_heuristic_ws(&ws->list); 62962306a36Sopenharmony_ci return ERR_PTR(-ENOMEM); 63062306a36Sopenharmony_ci} 63162306a36Sopenharmony_ci 63262306a36Sopenharmony_ciconst struct btrfs_compress_op btrfs_heuristic_compress = { 63362306a36Sopenharmony_ci .workspace_manager = &heuristic_wsm, 63462306a36Sopenharmony_ci}; 63562306a36Sopenharmony_ci 63662306a36Sopenharmony_cistatic const struct btrfs_compress_op * const btrfs_compress_op[] = { 63762306a36Sopenharmony_ci /* The heuristic is represented as compression type 0 */ 63862306a36Sopenharmony_ci &btrfs_heuristic_compress, 63962306a36Sopenharmony_ci &btrfs_zlib_compress, 64062306a36Sopenharmony_ci &btrfs_lzo_compress, 64162306a36Sopenharmony_ci &btrfs_zstd_compress, 64262306a36Sopenharmony_ci}; 64362306a36Sopenharmony_ci 64462306a36Sopenharmony_cistatic struct list_head *alloc_workspace(int type, unsigned int level) 64562306a36Sopenharmony_ci{ 64662306a36Sopenharmony_ci switch (type) { 64762306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level); 64862306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level); 64962306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level); 65062306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level); 65162306a36Sopenharmony_ci default: 65262306a36Sopenharmony_ci /* 65362306a36Sopenharmony_ci * This can't happen, the type is validated several times 65462306a36Sopenharmony_ci * before we get here. 65562306a36Sopenharmony_ci */ 65662306a36Sopenharmony_ci BUG(); 65762306a36Sopenharmony_ci } 65862306a36Sopenharmony_ci} 65962306a36Sopenharmony_ci 66062306a36Sopenharmony_cistatic void free_workspace(int type, struct list_head *ws) 66162306a36Sopenharmony_ci{ 66262306a36Sopenharmony_ci switch (type) { 66362306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws); 66462306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws); 66562306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws); 66662306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws); 66762306a36Sopenharmony_ci default: 66862306a36Sopenharmony_ci /* 66962306a36Sopenharmony_ci * This can't happen, the type is validated several times 67062306a36Sopenharmony_ci * before we get here. 67162306a36Sopenharmony_ci */ 67262306a36Sopenharmony_ci BUG(); 67362306a36Sopenharmony_ci } 67462306a36Sopenharmony_ci} 67562306a36Sopenharmony_ci 67662306a36Sopenharmony_cistatic void btrfs_init_workspace_manager(int type) 67762306a36Sopenharmony_ci{ 67862306a36Sopenharmony_ci struct workspace_manager *wsm; 67962306a36Sopenharmony_ci struct list_head *workspace; 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci wsm = btrfs_compress_op[type]->workspace_manager; 68262306a36Sopenharmony_ci INIT_LIST_HEAD(&wsm->idle_ws); 68362306a36Sopenharmony_ci spin_lock_init(&wsm->ws_lock); 68462306a36Sopenharmony_ci atomic_set(&wsm->total_ws, 0); 68562306a36Sopenharmony_ci init_waitqueue_head(&wsm->ws_wait); 68662306a36Sopenharmony_ci 68762306a36Sopenharmony_ci /* 68862306a36Sopenharmony_ci * Preallocate one workspace for each compression type so we can 68962306a36Sopenharmony_ci * guarantee forward progress in the worst case 69062306a36Sopenharmony_ci */ 69162306a36Sopenharmony_ci workspace = alloc_workspace(type, 0); 69262306a36Sopenharmony_ci if (IS_ERR(workspace)) { 69362306a36Sopenharmony_ci pr_warn( 69462306a36Sopenharmony_ci "BTRFS: cannot preallocate compression workspace, will try later\n"); 69562306a36Sopenharmony_ci } else { 69662306a36Sopenharmony_ci atomic_set(&wsm->total_ws, 1); 69762306a36Sopenharmony_ci wsm->free_ws = 1; 69862306a36Sopenharmony_ci list_add(workspace, &wsm->idle_ws); 69962306a36Sopenharmony_ci } 70062306a36Sopenharmony_ci} 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_cistatic void btrfs_cleanup_workspace_manager(int type) 70362306a36Sopenharmony_ci{ 70462306a36Sopenharmony_ci struct workspace_manager *wsman; 70562306a36Sopenharmony_ci struct list_head *ws; 70662306a36Sopenharmony_ci 70762306a36Sopenharmony_ci wsman = btrfs_compress_op[type]->workspace_manager; 70862306a36Sopenharmony_ci while (!list_empty(&wsman->idle_ws)) { 70962306a36Sopenharmony_ci ws = wsman->idle_ws.next; 71062306a36Sopenharmony_ci list_del(ws); 71162306a36Sopenharmony_ci free_workspace(type, ws); 71262306a36Sopenharmony_ci atomic_dec(&wsman->total_ws); 71362306a36Sopenharmony_ci } 71462306a36Sopenharmony_ci} 71562306a36Sopenharmony_ci 71662306a36Sopenharmony_ci/* 71762306a36Sopenharmony_ci * This finds an available workspace or allocates a new one. 71862306a36Sopenharmony_ci * If it's not possible to allocate a new one, waits until there's one. 71962306a36Sopenharmony_ci * Preallocation makes a forward progress guarantees and we do not return 72062306a36Sopenharmony_ci * errors. 72162306a36Sopenharmony_ci */ 72262306a36Sopenharmony_cistruct list_head *btrfs_get_workspace(int type, unsigned int level) 72362306a36Sopenharmony_ci{ 72462306a36Sopenharmony_ci struct workspace_manager *wsm; 72562306a36Sopenharmony_ci struct list_head *workspace; 72662306a36Sopenharmony_ci int cpus = num_online_cpus(); 72762306a36Sopenharmony_ci unsigned nofs_flag; 72862306a36Sopenharmony_ci struct list_head *idle_ws; 72962306a36Sopenharmony_ci spinlock_t *ws_lock; 73062306a36Sopenharmony_ci atomic_t *total_ws; 73162306a36Sopenharmony_ci wait_queue_head_t *ws_wait; 73262306a36Sopenharmony_ci int *free_ws; 73362306a36Sopenharmony_ci 73462306a36Sopenharmony_ci wsm = btrfs_compress_op[type]->workspace_manager; 73562306a36Sopenharmony_ci idle_ws = &wsm->idle_ws; 73662306a36Sopenharmony_ci ws_lock = &wsm->ws_lock; 73762306a36Sopenharmony_ci total_ws = &wsm->total_ws; 73862306a36Sopenharmony_ci ws_wait = &wsm->ws_wait; 73962306a36Sopenharmony_ci free_ws = &wsm->free_ws; 74062306a36Sopenharmony_ci 74162306a36Sopenharmony_ciagain: 74262306a36Sopenharmony_ci spin_lock(ws_lock); 74362306a36Sopenharmony_ci if (!list_empty(idle_ws)) { 74462306a36Sopenharmony_ci workspace = idle_ws->next; 74562306a36Sopenharmony_ci list_del(workspace); 74662306a36Sopenharmony_ci (*free_ws)--; 74762306a36Sopenharmony_ci spin_unlock(ws_lock); 74862306a36Sopenharmony_ci return workspace; 74962306a36Sopenharmony_ci 75062306a36Sopenharmony_ci } 75162306a36Sopenharmony_ci if (atomic_read(total_ws) > cpus) { 75262306a36Sopenharmony_ci DEFINE_WAIT(wait); 75362306a36Sopenharmony_ci 75462306a36Sopenharmony_ci spin_unlock(ws_lock); 75562306a36Sopenharmony_ci prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); 75662306a36Sopenharmony_ci if (atomic_read(total_ws) > cpus && !*free_ws) 75762306a36Sopenharmony_ci schedule(); 75862306a36Sopenharmony_ci finish_wait(ws_wait, &wait); 75962306a36Sopenharmony_ci goto again; 76062306a36Sopenharmony_ci } 76162306a36Sopenharmony_ci atomic_inc(total_ws); 76262306a36Sopenharmony_ci spin_unlock(ws_lock); 76362306a36Sopenharmony_ci 76462306a36Sopenharmony_ci /* 76562306a36Sopenharmony_ci * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have 76662306a36Sopenharmony_ci * to turn it off here because we might get called from the restricted 76762306a36Sopenharmony_ci * context of btrfs_compress_bio/btrfs_compress_pages 76862306a36Sopenharmony_ci */ 76962306a36Sopenharmony_ci nofs_flag = memalloc_nofs_save(); 77062306a36Sopenharmony_ci workspace = alloc_workspace(type, level); 77162306a36Sopenharmony_ci memalloc_nofs_restore(nofs_flag); 77262306a36Sopenharmony_ci 77362306a36Sopenharmony_ci if (IS_ERR(workspace)) { 77462306a36Sopenharmony_ci atomic_dec(total_ws); 77562306a36Sopenharmony_ci wake_up(ws_wait); 77662306a36Sopenharmony_ci 77762306a36Sopenharmony_ci /* 77862306a36Sopenharmony_ci * Do not return the error but go back to waiting. There's a 77962306a36Sopenharmony_ci * workspace preallocated for each type and the compression 78062306a36Sopenharmony_ci * time is bounded so we get to a workspace eventually. This 78162306a36Sopenharmony_ci * makes our caller's life easier. 78262306a36Sopenharmony_ci * 78362306a36Sopenharmony_ci * To prevent silent and low-probability deadlocks (when the 78462306a36Sopenharmony_ci * initial preallocation fails), check if there are any 78562306a36Sopenharmony_ci * workspaces at all. 78662306a36Sopenharmony_ci */ 78762306a36Sopenharmony_ci if (atomic_read(total_ws) == 0) { 78862306a36Sopenharmony_ci static DEFINE_RATELIMIT_STATE(_rs, 78962306a36Sopenharmony_ci /* once per minute */ 60 * HZ, 79062306a36Sopenharmony_ci /* no burst */ 1); 79162306a36Sopenharmony_ci 79262306a36Sopenharmony_ci if (__ratelimit(&_rs)) { 79362306a36Sopenharmony_ci pr_warn("BTRFS: no compression workspaces, low memory, retrying\n"); 79462306a36Sopenharmony_ci } 79562306a36Sopenharmony_ci } 79662306a36Sopenharmony_ci goto again; 79762306a36Sopenharmony_ci } 79862306a36Sopenharmony_ci return workspace; 79962306a36Sopenharmony_ci} 80062306a36Sopenharmony_ci 80162306a36Sopenharmony_cistatic struct list_head *get_workspace(int type, int level) 80262306a36Sopenharmony_ci{ 80362306a36Sopenharmony_ci switch (type) { 80462306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level); 80562306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level); 80662306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level); 80762306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level); 80862306a36Sopenharmony_ci default: 80962306a36Sopenharmony_ci /* 81062306a36Sopenharmony_ci * This can't happen, the type is validated several times 81162306a36Sopenharmony_ci * before we get here. 81262306a36Sopenharmony_ci */ 81362306a36Sopenharmony_ci BUG(); 81462306a36Sopenharmony_ci } 81562306a36Sopenharmony_ci} 81662306a36Sopenharmony_ci 81762306a36Sopenharmony_ci/* 81862306a36Sopenharmony_ci * put a workspace struct back on the list or free it if we have enough 81962306a36Sopenharmony_ci * idle ones sitting around 82062306a36Sopenharmony_ci */ 82162306a36Sopenharmony_civoid btrfs_put_workspace(int type, struct list_head *ws) 82262306a36Sopenharmony_ci{ 82362306a36Sopenharmony_ci struct workspace_manager *wsm; 82462306a36Sopenharmony_ci struct list_head *idle_ws; 82562306a36Sopenharmony_ci spinlock_t *ws_lock; 82662306a36Sopenharmony_ci atomic_t *total_ws; 82762306a36Sopenharmony_ci wait_queue_head_t *ws_wait; 82862306a36Sopenharmony_ci int *free_ws; 82962306a36Sopenharmony_ci 83062306a36Sopenharmony_ci wsm = btrfs_compress_op[type]->workspace_manager; 83162306a36Sopenharmony_ci idle_ws = &wsm->idle_ws; 83262306a36Sopenharmony_ci ws_lock = &wsm->ws_lock; 83362306a36Sopenharmony_ci total_ws = &wsm->total_ws; 83462306a36Sopenharmony_ci ws_wait = &wsm->ws_wait; 83562306a36Sopenharmony_ci free_ws = &wsm->free_ws; 83662306a36Sopenharmony_ci 83762306a36Sopenharmony_ci spin_lock(ws_lock); 83862306a36Sopenharmony_ci if (*free_ws <= num_online_cpus()) { 83962306a36Sopenharmony_ci list_add(ws, idle_ws); 84062306a36Sopenharmony_ci (*free_ws)++; 84162306a36Sopenharmony_ci spin_unlock(ws_lock); 84262306a36Sopenharmony_ci goto wake; 84362306a36Sopenharmony_ci } 84462306a36Sopenharmony_ci spin_unlock(ws_lock); 84562306a36Sopenharmony_ci 84662306a36Sopenharmony_ci free_workspace(type, ws); 84762306a36Sopenharmony_ci atomic_dec(total_ws); 84862306a36Sopenharmony_ciwake: 84962306a36Sopenharmony_ci cond_wake_up(ws_wait); 85062306a36Sopenharmony_ci} 85162306a36Sopenharmony_ci 85262306a36Sopenharmony_cistatic void put_workspace(int type, struct list_head *ws) 85362306a36Sopenharmony_ci{ 85462306a36Sopenharmony_ci switch (type) { 85562306a36Sopenharmony_ci case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws); 85662306a36Sopenharmony_ci case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws); 85762306a36Sopenharmony_ci case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws); 85862306a36Sopenharmony_ci case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws); 85962306a36Sopenharmony_ci default: 86062306a36Sopenharmony_ci /* 86162306a36Sopenharmony_ci * This can't happen, the type is validated several times 86262306a36Sopenharmony_ci * before we get here. 86362306a36Sopenharmony_ci */ 86462306a36Sopenharmony_ci BUG(); 86562306a36Sopenharmony_ci } 86662306a36Sopenharmony_ci} 86762306a36Sopenharmony_ci 86862306a36Sopenharmony_ci/* 86962306a36Sopenharmony_ci * Adjust @level according to the limits of the compression algorithm or 87062306a36Sopenharmony_ci * fallback to default 87162306a36Sopenharmony_ci */ 87262306a36Sopenharmony_cistatic unsigned int btrfs_compress_set_level(int type, unsigned level) 87362306a36Sopenharmony_ci{ 87462306a36Sopenharmony_ci const struct btrfs_compress_op *ops = btrfs_compress_op[type]; 87562306a36Sopenharmony_ci 87662306a36Sopenharmony_ci if (level == 0) 87762306a36Sopenharmony_ci level = ops->default_level; 87862306a36Sopenharmony_ci else 87962306a36Sopenharmony_ci level = min(level, ops->max_level); 88062306a36Sopenharmony_ci 88162306a36Sopenharmony_ci return level; 88262306a36Sopenharmony_ci} 88362306a36Sopenharmony_ci 88462306a36Sopenharmony_ci/* 88562306a36Sopenharmony_ci * Given an address space and start and length, compress the bytes into @pages 88662306a36Sopenharmony_ci * that are allocated on demand. 88762306a36Sopenharmony_ci * 88862306a36Sopenharmony_ci * @type_level is encoded algorithm and level, where level 0 means whatever 88962306a36Sopenharmony_ci * default the algorithm chooses and is opaque here; 89062306a36Sopenharmony_ci * - compression algo are 0-3 89162306a36Sopenharmony_ci * - the level are bits 4-7 89262306a36Sopenharmony_ci * 89362306a36Sopenharmony_ci * @out_pages is an in/out parameter, holds maximum number of pages to allocate 89462306a36Sopenharmony_ci * and returns number of actually allocated pages 89562306a36Sopenharmony_ci * 89662306a36Sopenharmony_ci * @total_in is used to return the number of bytes actually read. It 89762306a36Sopenharmony_ci * may be smaller than the input length if we had to exit early because we 89862306a36Sopenharmony_ci * ran out of room in the pages array or because we cross the 89962306a36Sopenharmony_ci * max_out threshold. 90062306a36Sopenharmony_ci * 90162306a36Sopenharmony_ci * @total_out is an in/out parameter, must be set to the input length and will 90262306a36Sopenharmony_ci * be also used to return the total number of compressed bytes 90362306a36Sopenharmony_ci */ 90462306a36Sopenharmony_ciint btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, 90562306a36Sopenharmony_ci u64 start, struct page **pages, 90662306a36Sopenharmony_ci unsigned long *out_pages, 90762306a36Sopenharmony_ci unsigned long *total_in, 90862306a36Sopenharmony_ci unsigned long *total_out) 90962306a36Sopenharmony_ci{ 91062306a36Sopenharmony_ci int type = btrfs_compress_type(type_level); 91162306a36Sopenharmony_ci int level = btrfs_compress_level(type_level); 91262306a36Sopenharmony_ci struct list_head *workspace; 91362306a36Sopenharmony_ci int ret; 91462306a36Sopenharmony_ci 91562306a36Sopenharmony_ci level = btrfs_compress_set_level(type, level); 91662306a36Sopenharmony_ci workspace = get_workspace(type, level); 91762306a36Sopenharmony_ci ret = compression_compress_pages(type, workspace, mapping, start, pages, 91862306a36Sopenharmony_ci out_pages, total_in, total_out); 91962306a36Sopenharmony_ci put_workspace(type, workspace); 92062306a36Sopenharmony_ci return ret; 92162306a36Sopenharmony_ci} 92262306a36Sopenharmony_ci 92362306a36Sopenharmony_cistatic int btrfs_decompress_bio(struct compressed_bio *cb) 92462306a36Sopenharmony_ci{ 92562306a36Sopenharmony_ci struct list_head *workspace; 92662306a36Sopenharmony_ci int ret; 92762306a36Sopenharmony_ci int type = cb->compress_type; 92862306a36Sopenharmony_ci 92962306a36Sopenharmony_ci workspace = get_workspace(type, 0); 93062306a36Sopenharmony_ci ret = compression_decompress_bio(workspace, cb); 93162306a36Sopenharmony_ci put_workspace(type, workspace); 93262306a36Sopenharmony_ci 93362306a36Sopenharmony_ci if (!ret) 93462306a36Sopenharmony_ci zero_fill_bio(&cb->orig_bbio->bio); 93562306a36Sopenharmony_ci return ret; 93662306a36Sopenharmony_ci} 93762306a36Sopenharmony_ci 93862306a36Sopenharmony_ci/* 93962306a36Sopenharmony_ci * a less complex decompression routine. Our compressed data fits in a 94062306a36Sopenharmony_ci * single page, and we want to read a single page out of it. 94162306a36Sopenharmony_ci * start_byte tells us the offset into the compressed data we're interested in 94262306a36Sopenharmony_ci */ 94362306a36Sopenharmony_ciint btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, 94462306a36Sopenharmony_ci unsigned long start_byte, size_t srclen, size_t destlen) 94562306a36Sopenharmony_ci{ 94662306a36Sopenharmony_ci struct list_head *workspace; 94762306a36Sopenharmony_ci int ret; 94862306a36Sopenharmony_ci 94962306a36Sopenharmony_ci workspace = get_workspace(type, 0); 95062306a36Sopenharmony_ci ret = compression_decompress(type, workspace, data_in, dest_page, 95162306a36Sopenharmony_ci start_byte, srclen, destlen); 95262306a36Sopenharmony_ci put_workspace(type, workspace); 95362306a36Sopenharmony_ci 95462306a36Sopenharmony_ci return ret; 95562306a36Sopenharmony_ci} 95662306a36Sopenharmony_ci 95762306a36Sopenharmony_ciint __init btrfs_init_compress(void) 95862306a36Sopenharmony_ci{ 95962306a36Sopenharmony_ci if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, 96062306a36Sopenharmony_ci offsetof(struct compressed_bio, bbio.bio), 96162306a36Sopenharmony_ci BIOSET_NEED_BVECS)) 96262306a36Sopenharmony_ci return -ENOMEM; 96362306a36Sopenharmony_ci btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); 96462306a36Sopenharmony_ci btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); 96562306a36Sopenharmony_ci btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); 96662306a36Sopenharmony_ci zstd_init_workspace_manager(); 96762306a36Sopenharmony_ci return 0; 96862306a36Sopenharmony_ci} 96962306a36Sopenharmony_ci 97062306a36Sopenharmony_civoid __cold btrfs_exit_compress(void) 97162306a36Sopenharmony_ci{ 97262306a36Sopenharmony_ci btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE); 97362306a36Sopenharmony_ci btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); 97462306a36Sopenharmony_ci btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); 97562306a36Sopenharmony_ci zstd_cleanup_workspace_manager(); 97662306a36Sopenharmony_ci bioset_exit(&btrfs_compressed_bioset); 97762306a36Sopenharmony_ci} 97862306a36Sopenharmony_ci 97962306a36Sopenharmony_ci/* 98062306a36Sopenharmony_ci * Copy decompressed data from working buffer to pages. 98162306a36Sopenharmony_ci * 98262306a36Sopenharmony_ci * @buf: The decompressed data buffer 98362306a36Sopenharmony_ci * @buf_len: The decompressed data length 98462306a36Sopenharmony_ci * @decompressed: Number of bytes that are already decompressed inside the 98562306a36Sopenharmony_ci * compressed extent 98662306a36Sopenharmony_ci * @cb: The compressed extent descriptor 98762306a36Sopenharmony_ci * @orig_bio: The original bio that the caller wants to read for 98862306a36Sopenharmony_ci * 98962306a36Sopenharmony_ci * An easier to understand graph is like below: 99062306a36Sopenharmony_ci * 99162306a36Sopenharmony_ci * |<- orig_bio ->| |<- orig_bio->| 99262306a36Sopenharmony_ci * |<------- full decompressed extent ----->| 99362306a36Sopenharmony_ci * |<----------- @cb range ---->| 99462306a36Sopenharmony_ci * | |<-- @buf_len -->| 99562306a36Sopenharmony_ci * |<--- @decompressed --->| 99662306a36Sopenharmony_ci * 99762306a36Sopenharmony_ci * Note that, @cb can be a subpage of the full decompressed extent, but 99862306a36Sopenharmony_ci * @cb->start always has the same as the orig_file_offset value of the full 99962306a36Sopenharmony_ci * decompressed extent. 100062306a36Sopenharmony_ci * 100162306a36Sopenharmony_ci * When reading compressed extent, we have to read the full compressed extent, 100262306a36Sopenharmony_ci * while @orig_bio may only want part of the range. 100362306a36Sopenharmony_ci * Thus this function will ensure only data covered by @orig_bio will be copied 100462306a36Sopenharmony_ci * to. 100562306a36Sopenharmony_ci * 100662306a36Sopenharmony_ci * Return 0 if we have copied all needed contents for @orig_bio. 100762306a36Sopenharmony_ci * Return >0 if we need continue decompress. 100862306a36Sopenharmony_ci */ 100962306a36Sopenharmony_ciint btrfs_decompress_buf2page(const char *buf, u32 buf_len, 101062306a36Sopenharmony_ci struct compressed_bio *cb, u32 decompressed) 101162306a36Sopenharmony_ci{ 101262306a36Sopenharmony_ci struct bio *orig_bio = &cb->orig_bbio->bio; 101362306a36Sopenharmony_ci /* Offset inside the full decompressed extent */ 101462306a36Sopenharmony_ci u32 cur_offset; 101562306a36Sopenharmony_ci 101662306a36Sopenharmony_ci cur_offset = decompressed; 101762306a36Sopenharmony_ci /* The main loop to do the copy */ 101862306a36Sopenharmony_ci while (cur_offset < decompressed + buf_len) { 101962306a36Sopenharmony_ci struct bio_vec bvec; 102062306a36Sopenharmony_ci size_t copy_len; 102162306a36Sopenharmony_ci u32 copy_start; 102262306a36Sopenharmony_ci /* Offset inside the full decompressed extent */ 102362306a36Sopenharmony_ci u32 bvec_offset; 102462306a36Sopenharmony_ci 102562306a36Sopenharmony_ci bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); 102662306a36Sopenharmony_ci /* 102762306a36Sopenharmony_ci * cb->start may underflow, but subtracting that value can still 102862306a36Sopenharmony_ci * give us correct offset inside the full decompressed extent. 102962306a36Sopenharmony_ci */ 103062306a36Sopenharmony_ci bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; 103162306a36Sopenharmony_ci 103262306a36Sopenharmony_ci /* Haven't reached the bvec range, exit */ 103362306a36Sopenharmony_ci if (decompressed + buf_len <= bvec_offset) 103462306a36Sopenharmony_ci return 1; 103562306a36Sopenharmony_ci 103662306a36Sopenharmony_ci copy_start = max(cur_offset, bvec_offset); 103762306a36Sopenharmony_ci copy_len = min(bvec_offset + bvec.bv_len, 103862306a36Sopenharmony_ci decompressed + buf_len) - copy_start; 103962306a36Sopenharmony_ci ASSERT(copy_len); 104062306a36Sopenharmony_ci 104162306a36Sopenharmony_ci /* 104262306a36Sopenharmony_ci * Extra range check to ensure we didn't go beyond 104362306a36Sopenharmony_ci * @buf + @buf_len. 104462306a36Sopenharmony_ci */ 104562306a36Sopenharmony_ci ASSERT(copy_start - decompressed < buf_len); 104662306a36Sopenharmony_ci memcpy_to_page(bvec.bv_page, bvec.bv_offset, 104762306a36Sopenharmony_ci buf + copy_start - decompressed, copy_len); 104862306a36Sopenharmony_ci cur_offset += copy_len; 104962306a36Sopenharmony_ci 105062306a36Sopenharmony_ci bio_advance(orig_bio, copy_len); 105162306a36Sopenharmony_ci /* Finished the bio */ 105262306a36Sopenharmony_ci if (!orig_bio->bi_iter.bi_size) 105362306a36Sopenharmony_ci return 0; 105462306a36Sopenharmony_ci } 105562306a36Sopenharmony_ci return 1; 105662306a36Sopenharmony_ci} 105762306a36Sopenharmony_ci 105862306a36Sopenharmony_ci/* 105962306a36Sopenharmony_ci * Shannon Entropy calculation 106062306a36Sopenharmony_ci * 106162306a36Sopenharmony_ci * Pure byte distribution analysis fails to determine compressibility of data. 106262306a36Sopenharmony_ci * Try calculating entropy to estimate the average minimum number of bits 106362306a36Sopenharmony_ci * needed to encode the sampled data. 106462306a36Sopenharmony_ci * 106562306a36Sopenharmony_ci * For convenience, return the percentage of needed bits, instead of amount of 106662306a36Sopenharmony_ci * bits directly. 106762306a36Sopenharmony_ci * 106862306a36Sopenharmony_ci * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy 106962306a36Sopenharmony_ci * and can be compressible with high probability 107062306a36Sopenharmony_ci * 107162306a36Sopenharmony_ci * @ENTROPY_LVL_HIGH - data are not compressible with high probability 107262306a36Sopenharmony_ci * 107362306a36Sopenharmony_ci * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. 107462306a36Sopenharmony_ci */ 107562306a36Sopenharmony_ci#define ENTROPY_LVL_ACEPTABLE (65) 107662306a36Sopenharmony_ci#define ENTROPY_LVL_HIGH (80) 107762306a36Sopenharmony_ci 107862306a36Sopenharmony_ci/* 107962306a36Sopenharmony_ci * For increasead precision in shannon_entropy calculation, 108062306a36Sopenharmony_ci * let's do pow(n, M) to save more digits after comma: 108162306a36Sopenharmony_ci * 108262306a36Sopenharmony_ci * - maximum int bit length is 64 108362306a36Sopenharmony_ci * - ilog2(MAX_SAMPLE_SIZE) -> 13 108462306a36Sopenharmony_ci * - 13 * 4 = 52 < 64 -> M = 4 108562306a36Sopenharmony_ci * 108662306a36Sopenharmony_ci * So use pow(n, 4). 108762306a36Sopenharmony_ci */ 108862306a36Sopenharmony_cistatic inline u32 ilog2_w(u64 n) 108962306a36Sopenharmony_ci{ 109062306a36Sopenharmony_ci return ilog2(n * n * n * n); 109162306a36Sopenharmony_ci} 109262306a36Sopenharmony_ci 109362306a36Sopenharmony_cistatic u32 shannon_entropy(struct heuristic_ws *ws) 109462306a36Sopenharmony_ci{ 109562306a36Sopenharmony_ci const u32 entropy_max = 8 * ilog2_w(2); 109662306a36Sopenharmony_ci u32 entropy_sum = 0; 109762306a36Sopenharmony_ci u32 p, p_base, sz_base; 109862306a36Sopenharmony_ci u32 i; 109962306a36Sopenharmony_ci 110062306a36Sopenharmony_ci sz_base = ilog2_w(ws->sample_size); 110162306a36Sopenharmony_ci for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { 110262306a36Sopenharmony_ci p = ws->bucket[i].count; 110362306a36Sopenharmony_ci p_base = ilog2_w(p); 110462306a36Sopenharmony_ci entropy_sum += p * (sz_base - p_base); 110562306a36Sopenharmony_ci } 110662306a36Sopenharmony_ci 110762306a36Sopenharmony_ci entropy_sum /= ws->sample_size; 110862306a36Sopenharmony_ci return entropy_sum * 100 / entropy_max; 110962306a36Sopenharmony_ci} 111062306a36Sopenharmony_ci 111162306a36Sopenharmony_ci#define RADIX_BASE 4U 111262306a36Sopenharmony_ci#define COUNTERS_SIZE (1U << RADIX_BASE) 111362306a36Sopenharmony_ci 111462306a36Sopenharmony_cistatic u8 get4bits(u64 num, int shift) { 111562306a36Sopenharmony_ci u8 low4bits; 111662306a36Sopenharmony_ci 111762306a36Sopenharmony_ci num >>= shift; 111862306a36Sopenharmony_ci /* Reverse order */ 111962306a36Sopenharmony_ci low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE); 112062306a36Sopenharmony_ci return low4bits; 112162306a36Sopenharmony_ci} 112262306a36Sopenharmony_ci 112362306a36Sopenharmony_ci/* 112462306a36Sopenharmony_ci * Use 4 bits as radix base 112562306a36Sopenharmony_ci * Use 16 u32 counters for calculating new position in buf array 112662306a36Sopenharmony_ci * 112762306a36Sopenharmony_ci * @array - array that will be sorted 112862306a36Sopenharmony_ci * @array_buf - buffer array to store sorting results 112962306a36Sopenharmony_ci * must be equal in size to @array 113062306a36Sopenharmony_ci * @num - array size 113162306a36Sopenharmony_ci */ 113262306a36Sopenharmony_cistatic void radix_sort(struct bucket_item *array, struct bucket_item *array_buf, 113362306a36Sopenharmony_ci int num) 113462306a36Sopenharmony_ci{ 113562306a36Sopenharmony_ci u64 max_num; 113662306a36Sopenharmony_ci u64 buf_num; 113762306a36Sopenharmony_ci u32 counters[COUNTERS_SIZE]; 113862306a36Sopenharmony_ci u32 new_addr; 113962306a36Sopenharmony_ci u32 addr; 114062306a36Sopenharmony_ci int bitlen; 114162306a36Sopenharmony_ci int shift; 114262306a36Sopenharmony_ci int i; 114362306a36Sopenharmony_ci 114462306a36Sopenharmony_ci /* 114562306a36Sopenharmony_ci * Try avoid useless loop iterations for small numbers stored in big 114662306a36Sopenharmony_ci * counters. Example: 48 33 4 ... in 64bit array 114762306a36Sopenharmony_ci */ 114862306a36Sopenharmony_ci max_num = array[0].count; 114962306a36Sopenharmony_ci for (i = 1; i < num; i++) { 115062306a36Sopenharmony_ci buf_num = array[i].count; 115162306a36Sopenharmony_ci if (buf_num > max_num) 115262306a36Sopenharmony_ci max_num = buf_num; 115362306a36Sopenharmony_ci } 115462306a36Sopenharmony_ci 115562306a36Sopenharmony_ci buf_num = ilog2(max_num); 115662306a36Sopenharmony_ci bitlen = ALIGN(buf_num, RADIX_BASE * 2); 115762306a36Sopenharmony_ci 115862306a36Sopenharmony_ci shift = 0; 115962306a36Sopenharmony_ci while (shift < bitlen) { 116062306a36Sopenharmony_ci memset(counters, 0, sizeof(counters)); 116162306a36Sopenharmony_ci 116262306a36Sopenharmony_ci for (i = 0; i < num; i++) { 116362306a36Sopenharmony_ci buf_num = array[i].count; 116462306a36Sopenharmony_ci addr = get4bits(buf_num, shift); 116562306a36Sopenharmony_ci counters[addr]++; 116662306a36Sopenharmony_ci } 116762306a36Sopenharmony_ci 116862306a36Sopenharmony_ci for (i = 1; i < COUNTERS_SIZE; i++) 116962306a36Sopenharmony_ci counters[i] += counters[i - 1]; 117062306a36Sopenharmony_ci 117162306a36Sopenharmony_ci for (i = num - 1; i >= 0; i--) { 117262306a36Sopenharmony_ci buf_num = array[i].count; 117362306a36Sopenharmony_ci addr = get4bits(buf_num, shift); 117462306a36Sopenharmony_ci counters[addr]--; 117562306a36Sopenharmony_ci new_addr = counters[addr]; 117662306a36Sopenharmony_ci array_buf[new_addr] = array[i]; 117762306a36Sopenharmony_ci } 117862306a36Sopenharmony_ci 117962306a36Sopenharmony_ci shift += RADIX_BASE; 118062306a36Sopenharmony_ci 118162306a36Sopenharmony_ci /* 118262306a36Sopenharmony_ci * Normal radix expects to move data from a temporary array, to 118362306a36Sopenharmony_ci * the main one. But that requires some CPU time. Avoid that 118462306a36Sopenharmony_ci * by doing another sort iteration to original array instead of 118562306a36Sopenharmony_ci * memcpy() 118662306a36Sopenharmony_ci */ 118762306a36Sopenharmony_ci memset(counters, 0, sizeof(counters)); 118862306a36Sopenharmony_ci 118962306a36Sopenharmony_ci for (i = 0; i < num; i ++) { 119062306a36Sopenharmony_ci buf_num = array_buf[i].count; 119162306a36Sopenharmony_ci addr = get4bits(buf_num, shift); 119262306a36Sopenharmony_ci counters[addr]++; 119362306a36Sopenharmony_ci } 119462306a36Sopenharmony_ci 119562306a36Sopenharmony_ci for (i = 1; i < COUNTERS_SIZE; i++) 119662306a36Sopenharmony_ci counters[i] += counters[i - 1]; 119762306a36Sopenharmony_ci 119862306a36Sopenharmony_ci for (i = num - 1; i >= 0; i--) { 119962306a36Sopenharmony_ci buf_num = array_buf[i].count; 120062306a36Sopenharmony_ci addr = get4bits(buf_num, shift); 120162306a36Sopenharmony_ci counters[addr]--; 120262306a36Sopenharmony_ci new_addr = counters[addr]; 120362306a36Sopenharmony_ci array[new_addr] = array_buf[i]; 120462306a36Sopenharmony_ci } 120562306a36Sopenharmony_ci 120662306a36Sopenharmony_ci shift += RADIX_BASE; 120762306a36Sopenharmony_ci } 120862306a36Sopenharmony_ci} 120962306a36Sopenharmony_ci 121062306a36Sopenharmony_ci/* 121162306a36Sopenharmony_ci * Size of the core byte set - how many bytes cover 90% of the sample 121262306a36Sopenharmony_ci * 121362306a36Sopenharmony_ci * There are several types of structured binary data that use nearly all byte 121462306a36Sopenharmony_ci * values. The distribution can be uniform and counts in all buckets will be 121562306a36Sopenharmony_ci * nearly the same (eg. encrypted data). Unlikely to be compressible. 121662306a36Sopenharmony_ci * 121762306a36Sopenharmony_ci * Other possibility is normal (Gaussian) distribution, where the data could 121862306a36Sopenharmony_ci * be potentially compressible, but we have to take a few more steps to decide 121962306a36Sopenharmony_ci * how much. 122062306a36Sopenharmony_ci * 122162306a36Sopenharmony_ci * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, 122262306a36Sopenharmony_ci * compression algo can easy fix that 122362306a36Sopenharmony_ci * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high 122462306a36Sopenharmony_ci * probability is not compressible 122562306a36Sopenharmony_ci */ 122662306a36Sopenharmony_ci#define BYTE_CORE_SET_LOW (64) 122762306a36Sopenharmony_ci#define BYTE_CORE_SET_HIGH (200) 122862306a36Sopenharmony_ci 122962306a36Sopenharmony_cistatic int byte_core_set_size(struct heuristic_ws *ws) 123062306a36Sopenharmony_ci{ 123162306a36Sopenharmony_ci u32 i; 123262306a36Sopenharmony_ci u32 coreset_sum = 0; 123362306a36Sopenharmony_ci const u32 core_set_threshold = ws->sample_size * 90 / 100; 123462306a36Sopenharmony_ci struct bucket_item *bucket = ws->bucket; 123562306a36Sopenharmony_ci 123662306a36Sopenharmony_ci /* Sort in reverse order */ 123762306a36Sopenharmony_ci radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE); 123862306a36Sopenharmony_ci 123962306a36Sopenharmony_ci for (i = 0; i < BYTE_CORE_SET_LOW; i++) 124062306a36Sopenharmony_ci coreset_sum += bucket[i].count; 124162306a36Sopenharmony_ci 124262306a36Sopenharmony_ci if (coreset_sum > core_set_threshold) 124362306a36Sopenharmony_ci return i; 124462306a36Sopenharmony_ci 124562306a36Sopenharmony_ci for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { 124662306a36Sopenharmony_ci coreset_sum += bucket[i].count; 124762306a36Sopenharmony_ci if (coreset_sum > core_set_threshold) 124862306a36Sopenharmony_ci break; 124962306a36Sopenharmony_ci } 125062306a36Sopenharmony_ci 125162306a36Sopenharmony_ci return i; 125262306a36Sopenharmony_ci} 125362306a36Sopenharmony_ci 125462306a36Sopenharmony_ci/* 125562306a36Sopenharmony_ci * Count byte values in buckets. 125662306a36Sopenharmony_ci * This heuristic can detect textual data (configs, xml, json, html, etc). 125762306a36Sopenharmony_ci * Because in most text-like data byte set is restricted to limited number of 125862306a36Sopenharmony_ci * possible characters, and that restriction in most cases makes data easy to 125962306a36Sopenharmony_ci * compress. 126062306a36Sopenharmony_ci * 126162306a36Sopenharmony_ci * @BYTE_SET_THRESHOLD - consider all data within this byte set size: 126262306a36Sopenharmony_ci * less - compressible 126362306a36Sopenharmony_ci * more - need additional analysis 126462306a36Sopenharmony_ci */ 126562306a36Sopenharmony_ci#define BYTE_SET_THRESHOLD (64) 126662306a36Sopenharmony_ci 126762306a36Sopenharmony_cistatic u32 byte_set_size(const struct heuristic_ws *ws) 126862306a36Sopenharmony_ci{ 126962306a36Sopenharmony_ci u32 i; 127062306a36Sopenharmony_ci u32 byte_set_size = 0; 127162306a36Sopenharmony_ci 127262306a36Sopenharmony_ci for (i = 0; i < BYTE_SET_THRESHOLD; i++) { 127362306a36Sopenharmony_ci if (ws->bucket[i].count > 0) 127462306a36Sopenharmony_ci byte_set_size++; 127562306a36Sopenharmony_ci } 127662306a36Sopenharmony_ci 127762306a36Sopenharmony_ci /* 127862306a36Sopenharmony_ci * Continue collecting count of byte values in buckets. If the byte 127962306a36Sopenharmony_ci * set size is bigger then the threshold, it's pointless to continue, 128062306a36Sopenharmony_ci * the detection technique would fail for this type of data. 128162306a36Sopenharmony_ci */ 128262306a36Sopenharmony_ci for (; i < BUCKET_SIZE; i++) { 128362306a36Sopenharmony_ci if (ws->bucket[i].count > 0) { 128462306a36Sopenharmony_ci byte_set_size++; 128562306a36Sopenharmony_ci if (byte_set_size > BYTE_SET_THRESHOLD) 128662306a36Sopenharmony_ci return byte_set_size; 128762306a36Sopenharmony_ci } 128862306a36Sopenharmony_ci } 128962306a36Sopenharmony_ci 129062306a36Sopenharmony_ci return byte_set_size; 129162306a36Sopenharmony_ci} 129262306a36Sopenharmony_ci 129362306a36Sopenharmony_cistatic bool sample_repeated_patterns(struct heuristic_ws *ws) 129462306a36Sopenharmony_ci{ 129562306a36Sopenharmony_ci const u32 half_of_sample = ws->sample_size / 2; 129662306a36Sopenharmony_ci const u8 *data = ws->sample; 129762306a36Sopenharmony_ci 129862306a36Sopenharmony_ci return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; 129962306a36Sopenharmony_ci} 130062306a36Sopenharmony_ci 130162306a36Sopenharmony_cistatic void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, 130262306a36Sopenharmony_ci struct heuristic_ws *ws) 130362306a36Sopenharmony_ci{ 130462306a36Sopenharmony_ci struct page *page; 130562306a36Sopenharmony_ci u64 index, index_end; 130662306a36Sopenharmony_ci u32 i, curr_sample_pos; 130762306a36Sopenharmony_ci u8 *in_data; 130862306a36Sopenharmony_ci 130962306a36Sopenharmony_ci /* 131062306a36Sopenharmony_ci * Compression handles the input data by chunks of 128KiB 131162306a36Sopenharmony_ci * (defined by BTRFS_MAX_UNCOMPRESSED) 131262306a36Sopenharmony_ci * 131362306a36Sopenharmony_ci * We do the same for the heuristic and loop over the whole range. 131462306a36Sopenharmony_ci * 131562306a36Sopenharmony_ci * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will 131662306a36Sopenharmony_ci * process no more than BTRFS_MAX_UNCOMPRESSED at a time. 131762306a36Sopenharmony_ci */ 131862306a36Sopenharmony_ci if (end - start > BTRFS_MAX_UNCOMPRESSED) 131962306a36Sopenharmony_ci end = start + BTRFS_MAX_UNCOMPRESSED; 132062306a36Sopenharmony_ci 132162306a36Sopenharmony_ci index = start >> PAGE_SHIFT; 132262306a36Sopenharmony_ci index_end = end >> PAGE_SHIFT; 132362306a36Sopenharmony_ci 132462306a36Sopenharmony_ci /* Don't miss unaligned end */ 132562306a36Sopenharmony_ci if (!PAGE_ALIGNED(end)) 132662306a36Sopenharmony_ci index_end++; 132762306a36Sopenharmony_ci 132862306a36Sopenharmony_ci curr_sample_pos = 0; 132962306a36Sopenharmony_ci while (index < index_end) { 133062306a36Sopenharmony_ci page = find_get_page(inode->i_mapping, index); 133162306a36Sopenharmony_ci in_data = kmap_local_page(page); 133262306a36Sopenharmony_ci /* Handle case where the start is not aligned to PAGE_SIZE */ 133362306a36Sopenharmony_ci i = start % PAGE_SIZE; 133462306a36Sopenharmony_ci while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { 133562306a36Sopenharmony_ci /* Don't sample any garbage from the last page */ 133662306a36Sopenharmony_ci if (start > end - SAMPLING_READ_SIZE) 133762306a36Sopenharmony_ci break; 133862306a36Sopenharmony_ci memcpy(&ws->sample[curr_sample_pos], &in_data[i], 133962306a36Sopenharmony_ci SAMPLING_READ_SIZE); 134062306a36Sopenharmony_ci i += SAMPLING_INTERVAL; 134162306a36Sopenharmony_ci start += SAMPLING_INTERVAL; 134262306a36Sopenharmony_ci curr_sample_pos += SAMPLING_READ_SIZE; 134362306a36Sopenharmony_ci } 134462306a36Sopenharmony_ci kunmap_local(in_data); 134562306a36Sopenharmony_ci put_page(page); 134662306a36Sopenharmony_ci 134762306a36Sopenharmony_ci index++; 134862306a36Sopenharmony_ci } 134962306a36Sopenharmony_ci 135062306a36Sopenharmony_ci ws->sample_size = curr_sample_pos; 135162306a36Sopenharmony_ci} 135262306a36Sopenharmony_ci 135362306a36Sopenharmony_ci/* 135462306a36Sopenharmony_ci * Compression heuristic. 135562306a36Sopenharmony_ci * 135662306a36Sopenharmony_ci * For now is's a naive and optimistic 'return true', we'll extend the logic to 135762306a36Sopenharmony_ci * quickly (compared to direct compression) detect data characteristics 135862306a36Sopenharmony_ci * (compressible/incompressible) to avoid wasting CPU time on incompressible 135962306a36Sopenharmony_ci * data. 136062306a36Sopenharmony_ci * 136162306a36Sopenharmony_ci * The following types of analysis can be performed: 136262306a36Sopenharmony_ci * - detect mostly zero data 136362306a36Sopenharmony_ci * - detect data with low "byte set" size (text, etc) 136462306a36Sopenharmony_ci * - detect data with low/high "core byte" set 136562306a36Sopenharmony_ci * 136662306a36Sopenharmony_ci * Return non-zero if the compression should be done, 0 otherwise. 136762306a36Sopenharmony_ci */ 136862306a36Sopenharmony_ciint btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) 136962306a36Sopenharmony_ci{ 137062306a36Sopenharmony_ci struct list_head *ws_list = get_workspace(0, 0); 137162306a36Sopenharmony_ci struct heuristic_ws *ws; 137262306a36Sopenharmony_ci u32 i; 137362306a36Sopenharmony_ci u8 byte; 137462306a36Sopenharmony_ci int ret = 0; 137562306a36Sopenharmony_ci 137662306a36Sopenharmony_ci ws = list_entry(ws_list, struct heuristic_ws, list); 137762306a36Sopenharmony_ci 137862306a36Sopenharmony_ci heuristic_collect_sample(inode, start, end, ws); 137962306a36Sopenharmony_ci 138062306a36Sopenharmony_ci if (sample_repeated_patterns(ws)) { 138162306a36Sopenharmony_ci ret = 1; 138262306a36Sopenharmony_ci goto out; 138362306a36Sopenharmony_ci } 138462306a36Sopenharmony_ci 138562306a36Sopenharmony_ci memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); 138662306a36Sopenharmony_ci 138762306a36Sopenharmony_ci for (i = 0; i < ws->sample_size; i++) { 138862306a36Sopenharmony_ci byte = ws->sample[i]; 138962306a36Sopenharmony_ci ws->bucket[byte].count++; 139062306a36Sopenharmony_ci } 139162306a36Sopenharmony_ci 139262306a36Sopenharmony_ci i = byte_set_size(ws); 139362306a36Sopenharmony_ci if (i < BYTE_SET_THRESHOLD) { 139462306a36Sopenharmony_ci ret = 2; 139562306a36Sopenharmony_ci goto out; 139662306a36Sopenharmony_ci } 139762306a36Sopenharmony_ci 139862306a36Sopenharmony_ci i = byte_core_set_size(ws); 139962306a36Sopenharmony_ci if (i <= BYTE_CORE_SET_LOW) { 140062306a36Sopenharmony_ci ret = 3; 140162306a36Sopenharmony_ci goto out; 140262306a36Sopenharmony_ci } 140362306a36Sopenharmony_ci 140462306a36Sopenharmony_ci if (i >= BYTE_CORE_SET_HIGH) { 140562306a36Sopenharmony_ci ret = 0; 140662306a36Sopenharmony_ci goto out; 140762306a36Sopenharmony_ci } 140862306a36Sopenharmony_ci 140962306a36Sopenharmony_ci i = shannon_entropy(ws); 141062306a36Sopenharmony_ci if (i <= ENTROPY_LVL_ACEPTABLE) { 141162306a36Sopenharmony_ci ret = 4; 141262306a36Sopenharmony_ci goto out; 141362306a36Sopenharmony_ci } 141462306a36Sopenharmony_ci 141562306a36Sopenharmony_ci /* 141662306a36Sopenharmony_ci * For the levels below ENTROPY_LVL_HIGH, additional analysis would be 141762306a36Sopenharmony_ci * needed to give green light to compression. 141862306a36Sopenharmony_ci * 141962306a36Sopenharmony_ci * For now just assume that compression at that level is not worth the 142062306a36Sopenharmony_ci * resources because: 142162306a36Sopenharmony_ci * 142262306a36Sopenharmony_ci * 1. it is possible to defrag the data later 142362306a36Sopenharmony_ci * 142462306a36Sopenharmony_ci * 2. the data would turn out to be hardly compressible, eg. 150 byte 142562306a36Sopenharmony_ci * values, every bucket has counter at level ~54. The heuristic would 142662306a36Sopenharmony_ci * be confused. This can happen when data have some internal repeated 142762306a36Sopenharmony_ci * patterns like "abbacbbc...". This can be detected by analyzing 142862306a36Sopenharmony_ci * pairs of bytes, which is too costly. 142962306a36Sopenharmony_ci */ 143062306a36Sopenharmony_ci if (i < ENTROPY_LVL_HIGH) { 143162306a36Sopenharmony_ci ret = 5; 143262306a36Sopenharmony_ci goto out; 143362306a36Sopenharmony_ci } else { 143462306a36Sopenharmony_ci ret = 0; 143562306a36Sopenharmony_ci goto out; 143662306a36Sopenharmony_ci } 143762306a36Sopenharmony_ci 143862306a36Sopenharmony_ciout: 143962306a36Sopenharmony_ci put_workspace(0, ws_list); 144062306a36Sopenharmony_ci return ret; 144162306a36Sopenharmony_ci} 144262306a36Sopenharmony_ci 144362306a36Sopenharmony_ci/* 144462306a36Sopenharmony_ci * Convert the compression suffix (eg. after "zlib" starting with ":") to 144562306a36Sopenharmony_ci * level, unrecognized string will set the default level 144662306a36Sopenharmony_ci */ 144762306a36Sopenharmony_ciunsigned int btrfs_compress_str2level(unsigned int type, const char *str) 144862306a36Sopenharmony_ci{ 144962306a36Sopenharmony_ci unsigned int level = 0; 145062306a36Sopenharmony_ci int ret; 145162306a36Sopenharmony_ci 145262306a36Sopenharmony_ci if (!type) 145362306a36Sopenharmony_ci return 0; 145462306a36Sopenharmony_ci 145562306a36Sopenharmony_ci if (str[0] == ':') { 145662306a36Sopenharmony_ci ret = kstrtouint(str + 1, 10, &level); 145762306a36Sopenharmony_ci if (ret) 145862306a36Sopenharmony_ci level = 0; 145962306a36Sopenharmony_ci } 146062306a36Sopenharmony_ci 146162306a36Sopenharmony_ci level = btrfs_compress_set_level(type, level); 146262306a36Sopenharmony_ci 146362306a36Sopenharmony_ci return level; 146462306a36Sopenharmony_ci} 1465