From 09e609fb082755d5620b1fc55f4671e7b882ef75 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 18 Oct 2018 16:08:46 +0200 Subject: [PATCH] util: provide `git__memmem` function Unfortunately, neither the `memmem` nor the `strnstr` functions are part of any C standard but are merely extensions of C that are implemented by e.g. glibc. Thus, there is no standardized way to search for a string in a block of memory with a limited size, and using `strstr` is to be considered unsafe in case where the buffer has not been sanitized. In fact, there are some uses of `strstr` in exactly that unsafe way in our codebase. Provide a new function `git__memmem` that implements the `memmem` semantics. That is in a given haystack of `n` bytes, search for the occurrence of a byte sequence of `m` bytes and return a pointer to the first occurrence. The implementation chosen is the "Not So Naive" algorithm from [1]. It was chosen as the implementation is comparably simple while still being reasonably efficient in most cases. Preprocessing happens in constant time and space, searching has a time complexity of O(n*m) with a slightly sub-linear average case. [1]: http://www-igm.univ-mlv.fr/~lecroq/string/ (cherry picked from commit 83e8a6b36acc67f2702cbbc7d4e334c7f7737719) --- src/util.c | 41 ++++++++++++++++++++++++++++++++++++++++ src/util.h | 3 +++ tests/core/memmem.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+) create mode 100644 tests/core/memmem.c diff --git a/src/util.c b/src/util.c index 370ea5837..1e7de93c3 100644 --- a/src/util.c +++ b/src/util.c @@ -353,6 +353,47 @@ size_t git__linenlen(const char *buffer, size_t buffer_len) return nl ? (size_t)(nl - buffer) + 1 : buffer_len; } +/* + * Adapted Not So Naive algorithm from http://www-igm.univ-mlv.fr/~lecroq/string/ + */ +const void * git__memmem(const void *haystack, size_t haystacklen, + const void *needle, size_t needlelen) +{ + const char *h, *n; + size_t j, k, l; + + if (needlelen > haystacklen || !haystacklen || !needlelen) + return NULL; + + h = (const char *) haystack, + n = (const char *) needle; + + if (needlelen == 1) + return memchr(haystack, *n, haystacklen); + + if (n[0] == n[1]) { + k = 2; + l = 1; + } else { + k = 1; + l = 2; + } + + j = 0; + while (j <= haystacklen - needlelen) { + if (n[1] != h[j + 1]) { + j += k; + } else { + if (memcmp(n + 2, h + j + 2, needlelen - 2) == 0 && + n[0] == h[j]) + return h + j; + j += l; + } + } + + return NULL; +} + void git__hexdump(const char *buffer, size_t len) { static const size_t LINE_WIDTH = 16; diff --git a/src/util.h b/src/util.h index 9c76f0ec0..2ac118d8f 100644 --- a/src/util.h +++ b/src/util.h @@ -316,6 +316,9 @@ GIT_INLINE(const void *) git__memrchr(const void *s, int c, size_t n) return NULL; } +extern const void * git__memmem(const void *haystack, size_t haystacklen, + const void *needle, size_t needlelen); + typedef int (*git__tsort_cmp)(const void *a, const void *b); extern void git__tsort(void **dst, size_t size, git__tsort_cmp cmp); diff --git a/tests/core/memmem.c b/tests/core/memmem.c new file mode 100644 index 000000000..fd9986d01 --- /dev/null +++ b/tests/core/memmem.c @@ -0,0 +1,46 @@ +#include "clar_libgit2.h" + +static void assert_found(const char *haystack, const char *needle, size_t expected_pos) +{ + cl_assert_equal_p(git__memmem(haystack, haystack ? strlen(haystack) : 0, + needle, needle ? strlen(needle) : 0), + haystack + expected_pos); +} + +static void assert_absent(const char *haystack, const char *needle) +{ + cl_assert_equal_p(git__memmem(haystack, haystack ? strlen(haystack) : 0, + needle, needle ? strlen(needle) : 0), + NULL); +} + +void test_core_memmem__found(void) +{ + assert_found("a", "a", 0); + assert_found("ab", "a", 0); + assert_found("ba", "a", 1); + assert_found("aa", "a", 0); + assert_found("aab", "aa", 0); + assert_found("baa", "aa", 1); + assert_found("dabc", "abc", 1); + assert_found("abababc", "abc", 4); +} + +void test_core_memmem__absent(void) +{ + assert_absent("a", "b"); + assert_absent("a", "aa"); + assert_absent("ba", "ab"); + assert_absent("ba", "ab"); + assert_absent("abc", "abcd"); + assert_absent("abcabcabc", "bcac"); +} + +void test_core_memmem__edgecases(void) +{ + assert_absent(NULL, NULL); + assert_absent("a", NULL); + assert_absent(NULL, "a"); + assert_absent("", "a"); + assert_absent("a", ""); +}