// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"

static const char * const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

const char *btrfs_compress_type2str(enum btrfs_compression_type type)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
	case BTRFS_COMPRESS_LZO:
	case BTRFS_COMPRESS_ZSTD:
	case BTRFS_COMPRESS_NONE:
		return btrfs_compress_types[type];
	default:
		break;
	}

	return NULL;
}

bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int i;

	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
		size_t comp_len = strlen(btrfs_compress_types[i]);

		if (len < comp_len)
			continue;

		if (!strncmp(btrfs_compress_types[i], str, comp_len))
			return true;
	}
	return false;
}

static int compression_compress_pages(int type, struct list_head *ws,
		struct address_space *mapping, u64 start, struct page **pages,
		unsigned long *out_pages, unsigned long *total_in,
		unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
		return zlib_compress_pages(ws, mapping, start, pages,
					   out_pages, total_in, total_out);
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_pages(ws, mapping, start, pages,
					  out_pages, total_in, total_out);
	case BTRFS_COMPRESS_ZSTD:
		return zstd_compress_pages(ws, mapping, start, pages,
					   out_pages, total_in, total_out);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can happen when compression races with a remount
		 * setting it to 'no compress', while the caller doesn't call
		 * inode_need_compress() to check whether we really need to
		 * compress.
		 *
		 * Not a big deal, we just need to inform the caller that we
		 * haven't allocated any pages yet.
		 */
		*out_pages = 0;
		return -E2BIG;
	}
}

static int compression_decompress_bio(int type, struct list_head *ws,
				      struct compressed_bio *cb)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int compression_decompress(int type, struct list_head *ws,
		unsigned char *data_in, struct page *dest_page,
		unsigned long start_byte, size_t srclen, size_t destlen)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int btrfs_decompress_bio(struct compressed_bio *cb);

static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
				      unsigned long disk_size)
{
	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);

	return sizeof(struct compressed_bio) +
		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size;
}

static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
				 u64 disk_start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	struct page *page;
	unsigned long i;
	char *kaddr;
	u8 csum[BTRFS_CSUM_SIZE];
	struct compressed_bio *cb = bio->bi_private;
	u8 *cb_sum = cb->sums;

	if (inode->flags & BTRFS_INODE_NODATASUM)
		return 0;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < cb->nr_pages; i++) {
		page = cb->compressed_pages[i];

		kaddr = kmap_atomic(page);
		crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
		kunmap_atomic(kaddr);

		if (memcmp(&csum, cb_sum, csum_size)) {
			btrfs_print_data_csum_error(inode, disk_start,
					csum, cb_sum, cb->mirror_num);
			if (btrfs_io_bio(bio)->device)
				btrfs_dev_stat_inc_and_print(
					btrfs_io_bio(bio)->device,
					BTRFS_DEV_STAT_CORRUPTION_ERRS);
			return -EIO;
		}
		cb_sum += csum_size;
	}
	return 0;
}

/*
 * When we finish reading compressed pages from the disk, we decompress
 * them and then run the bio end_io routines on the decompressed pages
 * (in the inode address space).
 *
 * This allows the checksumming and other IO error handling routines to
 * work normally.
 *
 * The compressed pages are freed here, and it must be run in process
 * context.
 */
static void end_compressed_bio_read(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned long index;
	unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
	int ret = 0;

	if (bio->bi_status)
		cb->errors = 1;

	/*
	 * If there are more bios still pending for this compressed extent,
	 * just exit.
	 */
	if (!refcount_dec_and_test(&cb->pending_bios))
		goto out;

	/*
	 * Record the correct mirror_num in cb->orig_bio so that
	 * read-repair can work properly.
	 */
	btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
	cb->mirror_num = mirror;

	/*
	 * Some IO in this cb has failed, just skip the checksum as there
	 * is no way it could be correct.
	 */
	if (cb->errors == 1)
		goto csum_failed;

	inode = cb->inode;
	ret = check_compressed_csum(BTRFS_I(inode), bio,
				    (u64)bio->bi_iter.bi_sector << 9);
	if (ret)
		goto csum_failed;

	/*
	 * OK, we're the last bio for this extent, let's start the
	 * decompression.
	 */
	ret = btrfs_decompress_bio(cb);

csum_failed:
	if (ret)
		cb->errors = 1;

	/* release the compressed pages */
	index = 0;
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* do io completion on the original bio */
	if (cb->errors) {
		bio_io_error(cb->orig_bio);
	} else {
		struct bio_vec *bvec;
		struct bvec_iter_all iter_all;

		/*
		 * We have verified the checksum already, set the pages
		 * Checked so the end_io handlers know about it.
		 */
		ASSERT(!bio_flagged(bio, BIO_CLONED));
		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
			SetPageChecked(bvec->bv_page);

		bio_endio(cb->orig_bio);
	}

	/* finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}

/*
 * Clear the writeback bits on all of the file pages for a compressed write.
 */
static noinline void end_compressed_writeback(struct inode *inode,
					      const struct compressed_bio *cb)
{
	unsigned long index = cb->start >> PAGE_SHIFT;
	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct page *pages[16];
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int ret;

	if (cb->errors)
		mapping_set_error(inode->i_mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			nr_pages -= 1;
			index += 1;
			continue;
		}
		for (i = 0; i < ret; i++) {
			if (cb->errors)
				SetPageError(pages[i]);
			end_page_writeback(pages[i]);
			put_page(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
	}
	/* the inode may be gone now */
}

/*
 * Do the cleanup once all the compressed pages hit the disk.  This will
 * clear writeback on the file pages and free the compressed pages.
 *
 * This also calls the writeback end hooks for the file pages so that
 * metadata and checksums can be updated in the file.
 */
static void end_compressed_bio_write(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned long index;

	if (bio->bi_status)
		cb->errors = 1;

	/*
	 * If there are more bios still pending for this compressed extent,
	 * just exit.
	 */
	if (!refcount_dec_and_test(&cb->pending_bios))
		goto out;

	/*
	 * OK, we're the last bio for this extent, step one is to call back
	 * into the FS and do all the end_io operations.
	 */
	inode = cb->inode;
	cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
	btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
			cb->start, cb->start + cb->len - 1,
			!cb->errors);
	cb->compressed_pages[0]->mapping = NULL;

	end_compressed_writeback(inode, cb);
	/* note, our inode could be gone now */

	/*
	 * Release the compressed pages, these came from alloc_page() and
	 * are not attached to the inode at all.
	 */
	index = 0;
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}

/*
 * Worker function to build and submit bios for previously compressed pages.
 * The corresponding pages in the inode should be marked for writeback
 * and the compressed pages should have a reference on them for dropping
 * when the IO is complete.
 *
 * This also checksums the file bytes and gets things ready for
 * the end io hooks.
 */
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				 unsigned long len, u64 disk_start,
				 unsigned long compressed_len,
				 struct page **compressed_pages,
				 unsigned long nr_pages,
				 unsigned int write_flags,
				 struct cgroup_subsys_state *blkcg_css)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = NULL;
	struct compressed_bio *cb;
	unsigned long bytes_left;
	int pg_index = 0;
	struct page *page;
	u64 first_byte = disk_start;
	blk_status_t ret;
	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;

	WARN_ON(!PAGE_ALIGNED(start));
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		return BLK_STS_RESOURCE;
	refcount_set(&cb->pending_bios, 0);
	cb->errors = 0;
	cb->inode = &inode->vfs_inode;
	cb->start = start;
	cb->len = len;
	cb->mirror_num = 0;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = compressed_len;
	cb->orig_bio = NULL;
	cb->nr_pages = nr_pages;

	bio = btrfs_bio_alloc(first_byte);
	bio->bi_opf = REQ_OP_WRITE | write_flags;
	bio->bi_private = cb;
	bio->bi_end_io = end_compressed_bio_write;

	if (blkcg_css) {
		bio->bi_opf |= REQ_CGROUP_PUNT;
		kthread_associate_blkcg(blkcg_css);
	}
	refcount_set(&cb->pending_bios, 1);

	/* create and submit bios for the compressed pages */
	bytes_left = compressed_len;
	for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
		int submit = 0;

		page = compressed_pages[pg_index];
		page->mapping = inode->vfs_inode.i_mapping;
		if (bio->bi_iter.bi_size)
			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
							  0);

		page->mapping = NULL;
		if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
		    PAGE_SIZE) {
			/*
			 * Inc the count before we submit the bio so
			 * we know the end IO handler won't happen before
			 * we inc the count.  Otherwise, the cb might get
			 * freed before we're done setting it up.
			 */
			refcount_inc(&cb->pending_bios);
			ret = btrfs_bio_wq_end_io(fs_info, bio,
						  BTRFS_WQ_ENDIO_DATA);
			BUG_ON(ret); /* -ENOMEM */

			if (!skip_sum) {
				ret = btrfs_csum_one_bio(inode, bio, start, 1);
				BUG_ON(ret); /* -ENOMEM */
			}

			ret = btrfs_map_bio(fs_info, bio, 0);
			if (ret) {
				bio->bi_status = ret;
				bio_endio(bio);
			}

			bio = btrfs_bio_alloc(first_byte);
			bio->bi_opf = REQ_OP_WRITE | write_flags;
			bio->bi_private = cb;
			bio->bi_end_io = end_compressed_bio_write;
			if (blkcg_css)
				bio->bi_opf |= REQ_CGROUP_PUNT;
			bio_add_page(bio, page, PAGE_SIZE, 0);
		}
		if (bytes_left < PAGE_SIZE) {
			btrfs_info(fs_info,
				   "bytes left %lu compress len %lu nr %lu",
				   bytes_left, cb->compressed_len, cb->nr_pages);
		}
		bytes_left -= PAGE_SIZE;
		first_byte += PAGE_SIZE;
		cond_resched();
	}

	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
	BUG_ON(ret); /* -ENOMEM */

	if (!skip_sum) {
		ret = btrfs_csum_one_bio(inode, bio, start, 1);
		BUG_ON(ret); /* -ENOMEM */
	}

	ret = btrfs_map_bio(fs_info, bio, 0);
	if (ret) {
		bio->bi_status = ret;
		bio_endio(bio);
	}

	if (blkcg_css)
		kthread_associate_blkcg(NULL);

	return 0;
}

static u64 bio_end_offset(struct bio *bio)
{
	struct bio_vec *last = bio_last_bvec_all(bio);

	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}

static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb)
{
	unsigned long end_index;
	unsigned long pg_index;
	u64 last_offset;
	u64 isize = i_size_read(inode);
	int ret;
	struct page *page;
	unsigned long nr_pages = 0;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	u64 end;
	int misses = 0;

	last_offset = bio_end_offset(cb->orig_bio);
	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	while (last_offset < compressed_end) {
		pg_index = last_offset >> PAGE_SHIFT;

		if (pg_index > end_index)
			break;

		page = xa_load(&mapping->i_pages, pg_index);
		if (page && !xa_is_value(page)) {
			misses++;
			if (misses > 4)
				break;
			goto next;
		}

		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
								 ~__GFP_FS));
		if (!page)
			break;

		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
			put_page(page);
			goto next;
		}

		end = last_offset + PAGE_SIZE - 1;
		/*
		 * At this point, we have a locked page in the page cache
		 * for these bytes in the file.  But, we have to make
		 * sure they map to this compressed extent on disk.
		 */
		set_page_extent_mapped(page);
		lock_extent(tree, last_offset, end);
		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, last_offset,
					   PAGE_SIZE);
		read_unlock(&em_tree->lock);

		if (!em || last_offset < em->start ||
		    (last_offset + PAGE_SIZE > extent_map_end(em)) ||
		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
			free_extent_map(em);
			unlock_extent(tree, last_offset, end);
			unlock_page(page);
			put_page(page);
			break;
		}
		free_extent_map(em);

		if (page->index == end_index) {
			char *userpage;
			size_t zero_offset = offset_in_page(isize);

			if (zero_offset) {
				int zeros;

				zeros = PAGE_SIZE - zero_offset;
				userpage = kmap_atomic(page);
				memset(userpage + zero_offset, 0, zeros);
				flush_dcache_page(page);
				kunmap_atomic(userpage);
			}
		}

		ret = bio_add_page(cb->orig_bio, page,
				   PAGE_SIZE, 0);

		if (ret == PAGE_SIZE) {
			nr_pages++;
			put_page(page);
		} else {
			unlock_extent(tree, last_offset, end);
			unlock_page(page);
			put_page(page);
			break;
		}
next:
		last_offset += PAGE_SIZE;
	}
	return 0;
}

/*
 * For a compressed read, the bio we get passed has all the inode pages
 * in it.  We don't actually do IO on those pages but allocate new ones
 * to hold the compressed pages on disk.
 *
 * bio->bi_iter.bi_sector points to the compressed extent on disk
 * bio->bi_io_vec points to all of the inode pages
 *
 * After the compressed pages are read, we copy the bytes into the
 * bio we were passed and then call the original bio's end_io routines.
 */
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
				 int mirror_num, unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map_tree *em_tree;
	struct compressed_bio *cb;
	unsigned long compressed_len;
	unsigned long nr_pages;
	unsigned long pg_index;
	struct page *page;
	struct bio *comp_bio;
	u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	blk_status_t ret = BLK_STS_RESOURCE;
	int faili = 0;
	const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	u8 *sums;

	em_tree = &BTRFS_I(inode)->extent_tree;

	/* we need the actual starting offset of this extent in the file */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree,
				   page_offset(bio_first_page_all(bio)),
				   PAGE_SIZE);
	read_unlock(&em_tree->lock);
	if (!em)
		return BLK_STS_IOERR;

	compressed_len = em->block_len;
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		goto out;

	refcount_set(&cb->pending_bios, 0);
	cb->errors = 0;
	cb->inode = inode;
	cb->mirror_num = mirror_num;
	sums = cb->sums;

	cb->start = em->orig_start;
	em_len = em->len;
	em_start = em->start;

	free_extent_map(em);
	em = NULL;

	cb->len = bio->bi_iter.bi_size;
	cb->compressed_len = compressed_len;
	cb->compress_type = extent_compress_type(bio_flags);
	cb->orig_bio = bio;

	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
				       GFP_NOFS);
	if (!cb->compressed_pages)
		goto fail1;

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
							    __GFP_HIGHMEM);
		if (!cb->compressed_pages[pg_index]) {
			faili = pg_index - 1;
			ret = BLK_STS_RESOURCE;
			goto fail2;
		}
	}
	faili = nr_pages - 1;
	cb->nr_pages = nr_pages;

	add_ra_bio_pages(inode, em_start + em_len, cb);

	/* include any pages we added in add_ra_bio_pages */
	cb->len = bio->bi_iter.bi_size;

	comp_bio = btrfs_bio_alloc(cur_disk_byte);
	comp_bio->bi_opf = REQ_OP_READ;
	comp_bio->bi_private = cb;
	comp_bio->bi_end_io = end_compressed_bio_read;
	refcount_set(&cb->pending_bios, 1);

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		int submit = 0;

		page = cb->compressed_pages[pg_index];
		page->mapping = inode->i_mapping;
		page->index = em_start >> PAGE_SHIFT;

		if (comp_bio->bi_iter.bi_size)
			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE,
							  comp_bio, 0);

		page->mapping = NULL;
		if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
		    PAGE_SIZE) {
			unsigned int nr_sectors;

			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
						  BTRFS_WQ_ENDIO_DATA);
			BUG_ON(ret); /* -ENOMEM */

			/*
			 * Inc the count before we submit the bio so
			 * we know the end IO handler won't happen before
			 * we inc the count.  Otherwise, the cb might get
			 * freed before we're done setting it up.
			 */
			refcount_inc(&cb->pending_bios);

			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
				ret = btrfs_lookup_bio_sums(inode, comp_bio,
							    (u64)-1, sums);
				BUG_ON(ret); /* -ENOMEM */
			}

			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
						  fs_info->sectorsize);
			sums += csum_size * nr_sectors;

			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
			if (ret) {
				comp_bio->bi_status = ret;
				bio_endio(comp_bio);
			}

			comp_bio = btrfs_bio_alloc(cur_disk_byte);
			comp_bio->bi_opf = REQ_OP_READ;
			comp_bio->bi_private = cb;
			comp_bio->bi_end_io = end_compressed_bio_read;

			bio_add_page(comp_bio, page, PAGE_SIZE, 0);
		}
		cur_disk_byte += PAGE_SIZE;
	}

	ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
	BUG_ON(ret); /* -ENOMEM */

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
		BUG_ON(ret); /* -ENOMEM */
	}

	ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
	if (ret) {
		comp_bio->bi_status = ret;
		bio_endio(comp_bio);
	}

	return 0;

fail2:
	while (faili >= 0) {
		__free_page(cb->compressed_pages[faili]);
		faili--;
	}

	kfree(cb->compressed_pages);
fail1:
	kfree(cb);
out:
	free_extent_map(em);
	return ret;
}

/*
 * The heuristic uses systematic sampling to collect data from the input data
 * range, the logic can be tuned by the following constants:
 *
 * @SAMPLING_READ_SIZE - how many bytes will be copied for each sample
 * @SAMPLING_INTERVAL  - distance between the starts of two consecutive samples
 */
#define SAMPLING_READ_SIZE	(16)
#define SAMPLING_INTERVAL	(256)

/*
 * For statistical analysis of the input data we consider bytes that form a
 * Galois Field of 256 objects. Each object has an attribute count, ie. how
 * many times the object appeared in the sample.
 */
#define BUCKET_SIZE		(256)

/*
 * The size of the sample is based on a statistical sampling rule of thumb.
 * The common way is to perform sampling tests as long as the number of
 * elements in each cell is at least 5.
 *
 * Instead of 5, we choose 32 to obtain more accurate results.
 * If the data contain the maximum number of symbols, which is 256, we obtain a
 * sample size bound by 8192.
 *
 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 * from up to 512 locations.
 */
#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED * \
				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
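
/*
 * Editor's sketch of the arithmetic above, assuming BTRFS_MAX_UNCOMPRESSED is
 * the 128KiB compression chunk size referenced later in this file:
 * 131072 * 16 / 256 = 8192, so the sample buffer is at most 8KiB, ie. 16
 * consecutive bytes taken from up to 512 evenly spaced locations within one
 * 128KiB input chunk.
 */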

struct bucket_item {
	u32 count;
};

struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

static struct workspace_manager heuristic_wsm;

static void free_heuristic_ws(struct list_head *ws)
{
	struct heuristic_ws *workspace;

	workspace = list_entry(ws, struct heuristic_ws, list);

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

static struct list_head *alloc_heuristic_ws(unsigned int level)
{
	struct heuristic_ws *ws;

	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return ERR_PTR(-ENOMEM);

	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
	if (!ws->sample)
		goto fail;

	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
	free_heuristic_ws(&ws->list);
	return ERR_PTR(-ENOMEM);
}

const struct btrfs_compress_op btrfs_heuristic_compress = {
	.workspace_manager = &heuristic_wsm,
};

static const struct btrfs_compress_op * const btrfs_compress_op[] = {
	/* The heuristic is represented as compression type 0 */
	&btrfs_heuristic_compress,
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
	&btrfs_zstd_compress,
};

static struct list_head *alloc_workspace(int type, unsigned int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void free_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_init_workspace_manager(int type)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;

	wsm = btrfs_compress_op[type]->workspace_manager;
	INIT_LIST_HEAD(&wsm->idle_ws);
	spin_lock_init(&wsm->ws_lock);
	atomic_set(&wsm->total_ws, 0);
	init_waitqueue_head(&wsm->ws_wait);

	/*
	 * Preallocate one workspace for each compression type so we can
	 * guarantee forward progress in the worst case.
	 */
	workspace = alloc_workspace(type, 0);
	if (IS_ERR(workspace)) {
		pr_warn(
	"BTRFS: cannot preallocate compression workspace, will try later\n");
	} else {
		atomic_set(&wsm->total_ws, 1);
		wsm->free_ws = 1;
		list_add(workspace, &wsm->idle_ws);
	}
}

static void btrfs_cleanup_workspace_manager(int type)
{
	struct workspace_manager *wsman;
	struct list_head *ws;

	wsman = btrfs_compress_op[type]->workspace_manager;
	while (!list_empty(&wsman->idle_ws)) {
		ws = wsman->idle_ws.next;
		list_del(ws);
		free_workspace(type, ws);
		atomic_dec(&wsman->total_ws);
	}
}

/*
 * This finds an available workspace or allocates a new one.
 * If it's not possible to allocate a new one, it waits until one is available.
 * Preallocation provides a forward progress guarantee and we do not return
 * errors.
 */
struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;
	int cpus = num_online_cpus();
	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

again:
	spin_lock(ws_lock);
	if (!list_empty(idle_ws)) {
		workspace = idle_ws->next;
		list_del(workspace);
		(*free_ws)--;
		spin_unlock(ws_lock);
		return workspace;
	}
	if (atomic_read(total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(ws_lock);
		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(total_ws) > cpus && !*free_ws)
			schedule();
		finish_wait(ws_wait, &wait);
		goto again;
	}
	atomic_inc(total_ws);
	spin_unlock(ws_lock);

	/*
	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
	 * to turn it off here because we might get called from the restricted
	 * context of btrfs_compress_bio/btrfs_compress_pages.
	 */
	nofs_flag = memalloc_nofs_save();
	workspace = alloc_workspace(type, level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(workspace)) {
		atomic_dec(total_ws);
		wake_up(ws_wait);

		/*
		 * Do not return the error but go back to waiting. There's a
		 * workspace preallocated for each type and the compression
		 * time is bounded so we get to a workspace eventually. This
		 * makes our caller's life easier.
		 *
		 * To prevent silent and low-probability deadlocks (when the
		 * initial preallocation fails), check if there are any
		 * workspaces at all.
		 */
		if (atomic_read(total_ws) == 0) {
			static DEFINE_RATELIMIT_STATE(_rs,
					/* once per minute */ 60 * HZ,
					/* no burst */ 1);

			if (__ratelimit(&_rs)) {
				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
			}
		}
		goto again;
	}
	return workspace;
}

static struct list_head *get_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around.
 */
void btrfs_put_workspace(int type, struct list_head *ws)
{
	struct workspace_manager *wsm;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

	spin_lock(ws_lock);
	if (*free_ws <= num_online_cpus()) {
		list_add(ws, idle_ws);
		(*free_ws)++;
		spin_unlock(ws_lock);
		goto wake;
	}
	spin_unlock(ws_lock);

	free_workspace(type, ws);
	atomic_dec(total_ws);
wake:
	cond_wake_up(ws_wait);
}

static void put_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Adjust @level according to the limits of the compression algorithm or
 * fall back to the default.
 */
static unsigned int btrfs_compress_set_level(int type, unsigned level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	if (level == 0)
		level = ops->default_level;
	else
		level = min(level, ops->max_level);

	return level;
}

/*
 * Given an address space and start and length, compress the bytes into @pages
 * that are allocated on demand.
 *
 * @type_level is the encoded algorithm and level, where level 0 means whatever
 * default the algorithm chooses and is opaque here;
 * - the compression algorithm is stored in the low 4 bits (values 0-3)
 * - the level is stored in bits 4-7
 *
 * @out_pages is an in/out parameter, holds the maximum number of pages to
 * allocate and returns the number of actually allocated pages
 *
 * @total_in is used to return the number of bytes actually read.  It
 * may be smaller than the input length if we had to exit early because we
 * ran out of room in the pages array or because we cross the
 * max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * also be used to return the total number of compressed bytes
 *
 * @max_out tells us the max number of bytes that we're allowed to
 * stuff into pages
 */
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
			 u64 start, struct page **pages,
			 unsigned long *out_pages,
			 unsigned long *total_in,
			 unsigned long *total_out)
{
	int type = btrfs_compress_type(type_level);
	int level = btrfs_compress_level(type_level);
	struct list_head *workspace;
	int ret;

	level = btrfs_compress_set_level(type, level);
	workspace = get_workspace(type, level);
	ret = compression_compress_pages(type, workspace, mapping, start, pages,
					 out_pages, total_in, total_out);
	put_workspace(type, workspace);
	return ret;
}
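
/*
 * Editor's note: a small worked example of the @type_level encoding described
 * above, assuming the btrfs_compress_type()/btrfs_compress_level() helpers
 * unpack the low and high nibble respectively.  "zstd:3" would be passed as
 * type_level = BTRFS_COMPRESS_ZSTD | (3 << 4), while a level of 0 lets
 * btrfs_compress_set_level() pick the algorithm's default level.
 */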

/*
 * pages_in is an array of pages with compressed data.
 *
 * disk_start is the starting logical offset of this array in the file
 *
 * orig_bio contains the pages from the file that we want to decompress into
 *
 * srclen is the number of bytes in pages_in
 *
 * The basic idea is that we have a bio that was created by readpages.
 * The pages in the bio are for the uncompressed data, and they may not
 * be contiguous.  They all correspond to the range of bytes covered by
 * the compressed extent.
 */
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
	struct list_head *workspace;
	int ret;
	int type = cb->compress_type;

	workspace = get_workspace(type, 0);
	ret = compression_decompress_bio(type, workspace, cb);
	put_workspace(type, workspace);

	return ret;
}

/*
 * A less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * start_byte tells us the offset into the compressed data we're interested in.
 */
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
		     unsigned long start_byte, size_t srclen, size_t destlen)
{
	struct list_head *workspace;
	int ret;

	workspace = get_workspace(type, 0);
	ret = compression_decompress(type, workspace, data_in, dest_page,
				     start_byte, srclen, destlen);
	put_workspace(type, workspace);

	return ret;
}

void __init btrfs_init_compress(void)
{
	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_init_workspace_manager();
}

void __cold btrfs_exit_compress(void)
{
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_cleanup_workspace_manager();
}

/*
 * Copy uncompressed data from the working buffer to pages.
 *
 * buf_start is the offset, inside the uncompressed data, of the first byte
 * currently in the working buffer.
 *
 * total_out is the offset, inside the uncompressed data, of the end of the
 * working buffer, ie. buf_start plus the number of bytes in the buffer.
 */
int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
			      unsigned long total_out, u64 disk_start,
			      struct bio *bio)
{
	unsigned long buf_offset;
	unsigned long current_buf_start;
	unsigned long start_byte;
	unsigned long prev_start_byte;
	unsigned long working_bytes = total_out - buf_start;
	unsigned long bytes;
	char *kaddr;
	struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter);

	/*
	 * start_byte is the first byte of the page we're currently
	 * copying into, relative to the start of the compressed data.
	 */
	start_byte = page_offset(bvec.bv_page) - disk_start;

	/* we haven't yet hit data corresponding to this page */
	if (total_out <= start_byte)
		return 1;

	/*
	 * The start of the data we care about is offset into
	 * the middle of our working buffer.
	 */
	if (total_out > start_byte && buf_start < start_byte) {
		buf_offset = start_byte - buf_start;
		working_bytes -= buf_offset;
	} else {
		buf_offset = 0;
	}
	current_buf_start = buf_start;

	/* copy bytes from the working buffer into the pages */
	while (working_bytes > 0) {
		bytes = min_t(unsigned long, bvec.bv_len,
				PAGE_SIZE - (buf_offset % PAGE_SIZE));
		bytes = min(bytes, working_bytes);

		kaddr = kmap_atomic(bvec.bv_page);
		memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes);
		kunmap_atomic(kaddr);
		flush_dcache_page(bvec.bv_page);

		buf_offset += bytes;
		working_bytes -= bytes;
		current_buf_start += bytes;

		/* check if we need to pick another page */
		bio_advance(bio, bytes);
		if (!bio->bi_iter.bi_size)
			return 0;
		bvec = bio_iter_iovec(bio, bio->bi_iter);
		prev_start_byte = start_byte;
		start_byte = page_offset(bvec.bv_page) - disk_start;

		/*
		 * We need to make sure we're only adjusting
		 * our offset into the compression working buffer when
		 * we're switching pages.  Otherwise we can incorrectly
		 * keep copying when we were actually done.
		 */
		if (start_byte != prev_start_byte) {
			/*
			 * Make sure our new page is covered by this
			 * working buffer.
			 */
			if (total_out <= start_byte)
				return 1;

			/*
			 * The next page in the biovec might not be adjacent
			 * to the last page, but it might still be found
			 * inside this working buffer.  Bump our offset pointer.
			 */
			if (total_out > start_byte &&
			    current_buf_start < start_byte) {
				buf_offset = start_byte - buf_start;
				working_bytes = total_out - start_byte;
				current_buf_start = buf_start + buf_offset;
			}
		}
	}

	return 1;
}

/*
 * Shannon Entropy calculation.
 *
 * Pure byte distribution analysis fails to determine compressibility of data.
 * Try calculating entropy to estimate the average minimum number of bits
 * needed to encode the sampled data.
 *
 * For convenience, return the percentage of needed bits, instead of the number
 * of bits directly.
 *
 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
 *			    and can be compressible with high probability
 *
 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
 *
 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
 */
#define ENTROPY_LVL_ACEPTABLE		(65)
#define ENTROPY_LVL_HIGH		(80)

/*
 * For increased precision in the shannon_entropy calculation,
 * let's do pow(n, M) to save more digits after the decimal point:
 *
 * - maximum int bit length is 64
 * - ilog2(MAX_SAMPLE_SIZE) -> 13
 * - 13 * 4 = 52 < 64 -> M = 4
 *
 * So use pow(n, 4).
 */
static inline u32 ilog2_w(u64 n)
{
	return ilog2(n * n * n * n);
}

static u32 shannon_entropy(struct heuristic_ws *ws)
{
	const u32 entropy_max = 8 * ilog2_w(2);
	u32 entropy_sum = 0;
	u32 p, p_base, sz_base;
	u32 i;

	sz_base = ilog2_w(ws->sample_size);
	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
		p = ws->bucket[i].count;
		p_base = ilog2_w(p);
		entropy_sum += p * (sz_base - p_base);
	}

	entropy_sum /= ws->sample_size;
	return entropy_sum * 100 / entropy_max;
}
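
/*
 * Editor's sketch of the arithmetic above, for a hypothetical uniformly
 * distributed 8192-byte sample: every bucket holds p = 32, sz_base =
 * ilog2_w(8192) = 52 and p_base = ilog2_w(32) = 20, so each sampled byte
 * contributes 52 - 20 = 32 and entropy_sum / sample_size = 32, which equals
 * entropy_max (8 * ilog2_w(2) = 32) and reports 100%.  Skewed distributions
 * shrink the per-byte term and push the percentage below
 * ENTROPY_LVL_ACEPTABLE.
 */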

#define RADIX_BASE		4U
#define COUNTERS_SIZE		(1U << RADIX_BASE)

static u8 get4bits(u64 num, int shift)
{
	u8 low4bits;

	num >>= shift;
	/* Reverse order */
	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
	return low4bits;
}

/*
 * Use 4 bits as the radix base.
 * Use 16 u32 counters for calculating the new position in the buf array.
 *
 * @array     - array that will be sorted
 * @array_buf - buffer array to store sorting results
 *              must be equal in size to @array
 * @num       - array size
 */
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
		       int num)
{
	u64 max_num;
	u64 buf_num;
	u32 counters[COUNTERS_SIZE];
	u32 new_addr;
	u32 addr;
	int bitlen;
	int shift;
	int i;

	/*
	 * Try to avoid useless loop iterations for small numbers stored in big
	 * counters.  Example: 48 33 4 ... in a 64bit array.
	 */
	max_num = array[0].count;
	for (i = 1; i < num; i++) {
		buf_num = array[i].count;
		if (buf_num > max_num)
			max_num = buf_num;
	}

	buf_num = ilog2(max_num);
	bitlen = ALIGN(buf_num, RADIX_BASE * 2);

	shift = 0;
	while (shift < bitlen) {
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array_buf[new_addr] = array[i];
		}

		shift += RADIX_BASE;

		/*
		 * A normal radix sort expects to move data from a temporary
		 * array to the main one.  But that requires some CPU time.
		 * Avoid that by doing another sort iteration into the original
		 * array instead of a memcpy().
		 */
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array[new_addr] = array_buf[i];
		}

		shift += RADIX_BASE;
	}
}
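
/*
 * Editor's note on the pass count: with RADIX_BASE = 4 each while-iteration
 * above handles 8 bits (two counting-sort passes).  If the largest bucket
 * count reaches the 8192-byte sample bound, ilog2(max_num) = 13 and
 * bitlen = ALIGN(13, 8) = 16, so the sort finishes after two iterations,
 * ie. four counting passes in total.
 */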

/*
 * Size of the core byte set - how many bytes cover 90% of the sample.
 *
 * There are several types of structured binary data that use nearly all byte
 * values. The distribution can be uniform and counts in all buckets will be
 * nearly the same (eg. encrypted data). Unlikely to be compressible.
 *
 * Another possibility is a normal (Gaussian) distribution, where the data
 * could be potentially compressible, but we have to take a few more steps to
 * decide how much.
 *
 * @BYTE_CORE_SET_LOW  - the main part of the byte values is repeated
 *			 frequently, a compression algorithm can easily
 *			 handle that
 * @BYTE_CORE_SET_HIGH - the data have a uniform distribution and with high
 *			 probability are not compressible
 */
#define BYTE_CORE_SET_LOW		(64)
#define BYTE_CORE_SET_HIGH		(200)

static int byte_core_set_size(struct heuristic_ws *ws)
{
	u32 i;
	u32 coreset_sum = 0;
	const u32 core_set_threshold = ws->sample_size * 90 / 100;
	struct bucket_item *bucket = ws->bucket;

	/* Sort in reverse order */
	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);

	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
		coreset_sum += bucket[i].count;

	if (coreset_sum > core_set_threshold)
		return i;

	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
		coreset_sum += bucket[i].count;
		if (coreset_sum > core_set_threshold)
			break;
	}

	return i;
}

/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc).
 * In most text-like data the byte set is restricted to a limited number of
 * possible characters, and that restriction in most cases makes the data easy
 * to compress.
 *
 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
 *	less - compressible
 *	more - need additional analysis
 */
#define BYTE_SET_THRESHOLD		(64)

static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 i;
	u32 byte_set_size = 0;

	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
		if (ws->bucket[i].count > 0)
			byte_set_size++;
	}

	/*
	 * Continue collecting the count of byte values in buckets.  If the
	 * byte set size is bigger than the threshold, it's pointless to
	 * continue, the detection technique would fail for this type of data.
	 */
	for (; i < BUCKET_SIZE; i++) {
		if (ws->bucket[i].count > 0) {
			byte_set_size++;
			if (byte_set_size > BYTE_SET_THRESHOLD)
				return byte_set_size;
		}
	}

	return byte_set_size;
}

static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
	const u32 half_of_sample = ws->sample_size / 2;
	const u8 *data = ws->sample;

	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
}

static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
				     struct heuristic_ws *ws)
{
	struct page *page;
	u64 index, index_end;
	u32 i, curr_sample_pos;
	u8 *in_data;

	/*
	 * Compression handles the input data by chunks of 128KiB
	 * (defined by BTRFS_MAX_UNCOMPRESSED).
	 *
	 * We do the same for the heuristic and loop over the whole range.
	 *
	 * MAX_SAMPLE_SIZE - calculated under the assumption that the heuristic
	 * will process no more than BTRFS_MAX_UNCOMPRESSED at a time.
	 */
	if (end - start > BTRFS_MAX_UNCOMPRESSED)
		end = start + BTRFS_MAX_UNCOMPRESSED;

	index = start >> PAGE_SHIFT;
	index_end = end >> PAGE_SHIFT;

	/* Don't miss unaligned end */
	if (!IS_ALIGNED(end, PAGE_SIZE))
		index_end++;

	curr_sample_pos = 0;
	while (index < index_end) {
		page = find_get_page(inode->i_mapping, index);
		in_data = kmap(page);
		/* Handle case where the start is not aligned to PAGE_SIZE */
		i = start % PAGE_SIZE;
		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
			/* Don't sample any garbage from the last page */
			if (start > end - SAMPLING_READ_SIZE)
				break;
			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
			       SAMPLING_READ_SIZE);
			i += SAMPLING_INTERVAL;
			start += SAMPLING_INTERVAL;
			curr_sample_pos += SAMPLING_READ_SIZE;
		}
		kunmap(page);
		put_page(page);

		index++;
	}

	ws->sample_size = curr_sample_pos;
}

/*
 * Compression heuristic.
 *
 * For now it's a naive and optimistic 'return true', we'll extend the logic to
 * quickly (compared to direct compression) detect data characteristics
 * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
 * data.
 *
 * The following types of analysis can be performed:
 * - detect mostly zero data
 * - detect data with a low "byte set" size (text, etc)
 * - detect data with a low/high "core byte" set
 *
 * Return non-zero if the compression should be done, 0 otherwise.
 */
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
	struct list_head *ws_list = get_workspace(0, 0);
	struct heuristic_ws *ws;
	u32 i;
	u8 byte;
	int ret = 0;

	ws = list_entry(ws_list, struct heuristic_ws, list);

	heuristic_collect_sample(inode, start, end, ws);

	if (sample_repeated_patterns(ws)) {
		ret = 1;
		goto out;
	}

	memset(ws->bucket, 0, sizeof(*ws->bucket) * BUCKET_SIZE);

	for (i = 0; i < ws->sample_size; i++) {
		byte = ws->sample[i];
		ws->bucket[byte].count++;
	}

	i = byte_set_size(ws);
	if (i < BYTE_SET_THRESHOLD) {
		ret = 2;
		goto out;
	}

	i = byte_core_set_size(ws);
	if (i <= BYTE_CORE_SET_LOW) {
		ret = 3;
		goto out;
	}

	if (i >= BYTE_CORE_SET_HIGH) {
		ret = 0;
		goto out;
	}

	i = shannon_entropy(ws);
	if (i <= ENTROPY_LVL_ACEPTABLE) {
		ret = 4;
		goto out;
	}

	/*
	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
	 * needed to give the green light to compression.
	 *
	 * For now just assume that compression at that level is not worth the
	 * resources because:
	 *
	 * 1. it is possible to defrag the data later
	 *
	 * 2. the data would turn out to be hardly compressible, eg. 150
	 * distinct byte values where every bucket has a counter at around 54.
	 * The heuristic would be confused. This can happen when data have some
	 * internal repeated patterns like "abbacbbc...". This can be detected
	 * by analyzing pairs of bytes, which is too costly.
	 */
	if (i < ENTROPY_LVL_HIGH) {
		ret = 5;
		goto out;
	} else {
		ret = 0;
		goto out;
	}

out:
	put_workspace(0, ws_list);
	return ret;
}

/*
 * Convert the compression suffix (eg. the part after "zlib", starting with
 * ":") to a level; an unrecognized string will set the default level.
 */
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
	unsigned int level = 0;
	int ret;

	if (!type)
		return 0;

	if (str[0] == ':') {
		ret = kstrtouint(str + 1, 10, &level);
		if (ret)
			level = 0;
	}

	level = btrfs_compress_set_level(type, level);

	return level;
}
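
/*
 * Editor's note, a brief usage sketch of the helper above: for a mount option
 * like "compress=zstd:3", the caller passes type = BTRFS_COMPRESS_ZSTD and
 * str = ":3", which yields level 3; a suffix such as ":garbage" or a missing
 * suffix falls back to the algorithm's default level via
 * btrfs_compress_set_level().
 */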