18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * lib/ts_bm.c Boyer-Moore text search implementation 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Authors: Pablo Neira Ayuso <pablo@eurodev.net> 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * ========================================================================== 88c2ecf20Sopenharmony_ci * 98c2ecf20Sopenharmony_ci * Implements Boyer-Moore string matching algorithm: 108c2ecf20Sopenharmony_ci * 118c2ecf20Sopenharmony_ci * [1] A Fast String Searching Algorithm, R.S. Boyer and Moore. 128c2ecf20Sopenharmony_ci * Communications of the Association for Computing Machinery, 138c2ecf20Sopenharmony_ci * 20(10), 1977, pp. 762-772. 148c2ecf20Sopenharmony_ci * https://www.cs.utexas.edu/users/moore/publications/fstrpos.pdf 158c2ecf20Sopenharmony_ci * 168c2ecf20Sopenharmony_ci * [2] Handbook of Exact String Matching Algorithms, Thierry Lecroq, 2004 178c2ecf20Sopenharmony_ci * http://www-igm.univ-mlv.fr/~lecroq/string/string.pdf 188c2ecf20Sopenharmony_ci * 198c2ecf20Sopenharmony_ci * Note: Since Boyer-Moore (BM) performs searches for matchings from right 208c2ecf20Sopenharmony_ci * to left, it's still possible that a matching could be spread over 218c2ecf20Sopenharmony_ci * multiple blocks, in that case this algorithm won't find any coincidence. 228c2ecf20Sopenharmony_ci * 238c2ecf20Sopenharmony_ci * If you're willing to ensure that such thing won't ever happen, use the 248c2ecf20Sopenharmony_ci * Knuth-Pratt-Morris (KMP) implementation instead. In conclusion, choose 258c2ecf20Sopenharmony_ci * the proper string search algorithm depending on your setting. 268c2ecf20Sopenharmony_ci * 278c2ecf20Sopenharmony_ci * Say you're using the textsearch infrastructure for filtering, NIDS or 288c2ecf20Sopenharmony_ci * any similar security focused purpose, then go KMP. Otherwise, if you 298c2ecf20Sopenharmony_ci * really care about performance, say you're classifying packets to apply 308c2ecf20Sopenharmony_ci * Quality of Service (QoS) policies, and you don't mind about possible 318c2ecf20Sopenharmony_ci * matchings spread over multiple fragments, then go BM. 328c2ecf20Sopenharmony_ci */ 338c2ecf20Sopenharmony_ci 348c2ecf20Sopenharmony_ci#include <linux/kernel.h> 358c2ecf20Sopenharmony_ci#include <linux/module.h> 368c2ecf20Sopenharmony_ci#include <linux/types.h> 378c2ecf20Sopenharmony_ci#include <linux/string.h> 388c2ecf20Sopenharmony_ci#include <linux/ctype.h> 398c2ecf20Sopenharmony_ci#include <linux/textsearch.h> 408c2ecf20Sopenharmony_ci 418c2ecf20Sopenharmony_ci/* Alphabet size, use ASCII */ 428c2ecf20Sopenharmony_ci#define ASIZE 256 438c2ecf20Sopenharmony_ci 448c2ecf20Sopenharmony_ci#if 0 458c2ecf20Sopenharmony_ci#define DEBUGP printk 468c2ecf20Sopenharmony_ci#else 478c2ecf20Sopenharmony_ci#define DEBUGP(args, format...) 488c2ecf20Sopenharmony_ci#endif 498c2ecf20Sopenharmony_ci 508c2ecf20Sopenharmony_cistruct ts_bm 518c2ecf20Sopenharmony_ci{ 528c2ecf20Sopenharmony_ci u8 * pattern; 538c2ecf20Sopenharmony_ci unsigned int patlen; 548c2ecf20Sopenharmony_ci unsigned int bad_shift[ASIZE]; 558c2ecf20Sopenharmony_ci unsigned int good_shift[]; 568c2ecf20Sopenharmony_ci}; 578c2ecf20Sopenharmony_ci 588c2ecf20Sopenharmony_cistatic unsigned int bm_find(struct ts_config *conf, struct ts_state *state) 598c2ecf20Sopenharmony_ci{ 608c2ecf20Sopenharmony_ci struct ts_bm *bm = ts_config_priv(conf); 618c2ecf20Sopenharmony_ci unsigned int i, text_len, consumed = state->offset; 628c2ecf20Sopenharmony_ci const u8 *text; 638c2ecf20Sopenharmony_ci int bs; 648c2ecf20Sopenharmony_ci const u8 icase = conf->flags & TS_IGNORECASE; 658c2ecf20Sopenharmony_ci 668c2ecf20Sopenharmony_ci for (;;) { 678c2ecf20Sopenharmony_ci int shift = bm->patlen - 1; 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_ci text_len = conf->get_next_block(consumed, &text, conf, state); 708c2ecf20Sopenharmony_ci 718c2ecf20Sopenharmony_ci if (unlikely(text_len == 0)) 728c2ecf20Sopenharmony_ci break; 738c2ecf20Sopenharmony_ci 748c2ecf20Sopenharmony_ci while (shift < text_len) { 758c2ecf20Sopenharmony_ci DEBUGP("Searching in position %d (%c)\n", 768c2ecf20Sopenharmony_ci shift, text[shift]); 778c2ecf20Sopenharmony_ci for (i = 0; i < bm->patlen; i++) 788c2ecf20Sopenharmony_ci if ((icase ? toupper(text[shift-i]) 798c2ecf20Sopenharmony_ci : text[shift-i]) 808c2ecf20Sopenharmony_ci != bm->pattern[bm->patlen-1-i]) 818c2ecf20Sopenharmony_ci goto next; 828c2ecf20Sopenharmony_ci 838c2ecf20Sopenharmony_ci /* London calling... */ 848c2ecf20Sopenharmony_ci DEBUGP("found!\n"); 858c2ecf20Sopenharmony_ci return consumed += (shift-(bm->patlen-1)); 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_cinext: bs = bm->bad_shift[text[shift-i]]; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci /* Now jumping to... */ 908c2ecf20Sopenharmony_ci shift = max_t(int, shift-i+bs, shift+bm->good_shift[i]); 918c2ecf20Sopenharmony_ci } 928c2ecf20Sopenharmony_ci consumed += text_len; 938c2ecf20Sopenharmony_ci } 948c2ecf20Sopenharmony_ci 958c2ecf20Sopenharmony_ci return UINT_MAX; 968c2ecf20Sopenharmony_ci} 978c2ecf20Sopenharmony_ci 988c2ecf20Sopenharmony_cistatic int subpattern(u8 *pattern, int i, int j, int g) 998c2ecf20Sopenharmony_ci{ 1008c2ecf20Sopenharmony_ci int x = i+g-1, y = j+g-1, ret = 0; 1018c2ecf20Sopenharmony_ci 1028c2ecf20Sopenharmony_ci while(pattern[x--] == pattern[y--]) { 1038c2ecf20Sopenharmony_ci if (y < 0) { 1048c2ecf20Sopenharmony_ci ret = 1; 1058c2ecf20Sopenharmony_ci break; 1068c2ecf20Sopenharmony_ci } 1078c2ecf20Sopenharmony_ci if (--g == 0) { 1088c2ecf20Sopenharmony_ci ret = pattern[i-1] != pattern[j-1]; 1098c2ecf20Sopenharmony_ci break; 1108c2ecf20Sopenharmony_ci } 1118c2ecf20Sopenharmony_ci } 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci return ret; 1148c2ecf20Sopenharmony_ci} 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_cistatic void compute_prefix_tbl(struct ts_bm *bm, int flags) 1178c2ecf20Sopenharmony_ci{ 1188c2ecf20Sopenharmony_ci int i, j, g; 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci for (i = 0; i < ASIZE; i++) 1218c2ecf20Sopenharmony_ci bm->bad_shift[i] = bm->patlen; 1228c2ecf20Sopenharmony_ci for (i = 0; i < bm->patlen - 1; i++) { 1238c2ecf20Sopenharmony_ci bm->bad_shift[bm->pattern[i]] = bm->patlen - 1 - i; 1248c2ecf20Sopenharmony_ci if (flags & TS_IGNORECASE) 1258c2ecf20Sopenharmony_ci bm->bad_shift[tolower(bm->pattern[i])] 1268c2ecf20Sopenharmony_ci = bm->patlen - 1 - i; 1278c2ecf20Sopenharmony_ci } 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_ci /* Compute the good shift array, used to match reocurrences 1308c2ecf20Sopenharmony_ci * of a subpattern */ 1318c2ecf20Sopenharmony_ci bm->good_shift[0] = 1; 1328c2ecf20Sopenharmony_ci for (i = 1; i < bm->patlen; i++) 1338c2ecf20Sopenharmony_ci bm->good_shift[i] = bm->patlen; 1348c2ecf20Sopenharmony_ci for (i = bm->patlen-1, g = 1; i > 0; g++, i--) { 1358c2ecf20Sopenharmony_ci for (j = i-1; j >= 1-g ; j--) 1368c2ecf20Sopenharmony_ci if (subpattern(bm->pattern, i, j, g)) { 1378c2ecf20Sopenharmony_ci bm->good_shift[g] = bm->patlen-j-g; 1388c2ecf20Sopenharmony_ci break; 1398c2ecf20Sopenharmony_ci } 1408c2ecf20Sopenharmony_ci } 1418c2ecf20Sopenharmony_ci} 1428c2ecf20Sopenharmony_ci 1438c2ecf20Sopenharmony_cistatic struct ts_config *bm_init(const void *pattern, unsigned int len, 1448c2ecf20Sopenharmony_ci gfp_t gfp_mask, int flags) 1458c2ecf20Sopenharmony_ci{ 1468c2ecf20Sopenharmony_ci struct ts_config *conf; 1478c2ecf20Sopenharmony_ci struct ts_bm *bm; 1488c2ecf20Sopenharmony_ci int i; 1498c2ecf20Sopenharmony_ci unsigned int prefix_tbl_len = len * sizeof(unsigned int); 1508c2ecf20Sopenharmony_ci size_t priv_size = sizeof(*bm) + len + prefix_tbl_len; 1518c2ecf20Sopenharmony_ci 1528c2ecf20Sopenharmony_ci conf = alloc_ts_config(priv_size, gfp_mask); 1538c2ecf20Sopenharmony_ci if (IS_ERR(conf)) 1548c2ecf20Sopenharmony_ci return conf; 1558c2ecf20Sopenharmony_ci 1568c2ecf20Sopenharmony_ci conf->flags = flags; 1578c2ecf20Sopenharmony_ci bm = ts_config_priv(conf); 1588c2ecf20Sopenharmony_ci bm->patlen = len; 1598c2ecf20Sopenharmony_ci bm->pattern = (u8 *) bm->good_shift + prefix_tbl_len; 1608c2ecf20Sopenharmony_ci if (flags & TS_IGNORECASE) 1618c2ecf20Sopenharmony_ci for (i = 0; i < len; i++) 1628c2ecf20Sopenharmony_ci bm->pattern[i] = toupper(((u8 *)pattern)[i]); 1638c2ecf20Sopenharmony_ci else 1648c2ecf20Sopenharmony_ci memcpy(bm->pattern, pattern, len); 1658c2ecf20Sopenharmony_ci compute_prefix_tbl(bm, flags); 1668c2ecf20Sopenharmony_ci 1678c2ecf20Sopenharmony_ci return conf; 1688c2ecf20Sopenharmony_ci} 1698c2ecf20Sopenharmony_ci 1708c2ecf20Sopenharmony_cistatic void *bm_get_pattern(struct ts_config *conf) 1718c2ecf20Sopenharmony_ci{ 1728c2ecf20Sopenharmony_ci struct ts_bm *bm = ts_config_priv(conf); 1738c2ecf20Sopenharmony_ci return bm->pattern; 1748c2ecf20Sopenharmony_ci} 1758c2ecf20Sopenharmony_ci 1768c2ecf20Sopenharmony_cistatic unsigned int bm_get_pattern_len(struct ts_config *conf) 1778c2ecf20Sopenharmony_ci{ 1788c2ecf20Sopenharmony_ci struct ts_bm *bm = ts_config_priv(conf); 1798c2ecf20Sopenharmony_ci return bm->patlen; 1808c2ecf20Sopenharmony_ci} 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_cistatic struct ts_ops bm_ops = { 1838c2ecf20Sopenharmony_ci .name = "bm", 1848c2ecf20Sopenharmony_ci .find = bm_find, 1858c2ecf20Sopenharmony_ci .init = bm_init, 1868c2ecf20Sopenharmony_ci .get_pattern = bm_get_pattern, 1878c2ecf20Sopenharmony_ci .get_pattern_len = bm_get_pattern_len, 1888c2ecf20Sopenharmony_ci .owner = THIS_MODULE, 1898c2ecf20Sopenharmony_ci .list = LIST_HEAD_INIT(bm_ops.list) 1908c2ecf20Sopenharmony_ci}; 1918c2ecf20Sopenharmony_ci 1928c2ecf20Sopenharmony_cistatic int __init init_bm(void) 1938c2ecf20Sopenharmony_ci{ 1948c2ecf20Sopenharmony_ci return textsearch_register(&bm_ops); 1958c2ecf20Sopenharmony_ci} 1968c2ecf20Sopenharmony_ci 1978c2ecf20Sopenharmony_cistatic void __exit exit_bm(void) 1988c2ecf20Sopenharmony_ci{ 1998c2ecf20Sopenharmony_ci textsearch_unregister(&bm_ops); 2008c2ecf20Sopenharmony_ci} 2018c2ecf20Sopenharmony_ci 2028c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 2038c2ecf20Sopenharmony_ci 2048c2ecf20Sopenharmony_cimodule_init(init_bm); 2058c2ecf20Sopenharmony_cimodule_exit(exit_bm); 206