💾 Archived View for gmi.noulin.net › gitRepositories › sodiumTest › file › bloom.c.gmi captured on 2023-07-10 at 18:14:25. Gemini links have been rewritten to link to archived content
⬅️ Previous capture (2023-01-29)
-=-=-=-=-=-=-
bloom.c (7564B)
1 /* 2 * Copyright (c) 2012-2022, Jyri J. Virkki 3 * All rights reserved. 4 * 5 * This file is under BSD license. See LICENSE file. 6 */ 7 8 /* 9 * Refer to bloom.h for documentation on the public interfaces. 10 */ 11 12 #include <assert.h> 13 #include <fcntl.h> 14 #include <math.h> 15 #include <stdint.h> 16 #include <stdio.h> 17 #include <stdlib.h> 18 #include <string.h> 19 #include <sys/stat.h> 20 #include <sys/types.h> 21 #include <unistd.h> 22 23 #include "bloom.h" 24 #include "murmurhash2.h" 25 26 #define MAKESTRING(n) STRING(n) 27 #define STRING(n) #n 28 #define BLOOM_MAGIC "libbloom2" 29 30 inline static int test_bit_set_bit(unsigned char * buf, 31 unsigned long int bit, int set_bit) 32 { 33 unsigned long int byte = bit >> 3; 34 unsigned char c = buf[byte]; // expensive memory access 35 unsigned char mask = 1 << (bit % 8ul); 36 37 if (c & mask) { 38 return 1; 39 } else { 40 if (set_bit) { 41 buf[byte] = c | mask; 42 } 43 return 0; 44 } 45 } 46 47 48 static int bloom_check_add(struct bloom * bloom, 49 const void * buffer, int len, int add) 50 { 51 if (bloom->ready == 0) { 52 printf("bloom at %p not initialized!\n", (void *)bloom); 53 return -1; 54 } 55 56 unsigned char hits = 0; 57 unsigned int a = murmurhash2(buffer, len, 0x9747b28c); 58 unsigned int b = murmurhash2(buffer, len, a); 59 unsigned long int x; 60 unsigned long int i; 61 62 for (i = 0; i < bloom->hashes; i++) { 63 x = (a + b*i) % bloom->bits; 64 if (test_bit_set_bit(bloom->bf, x, add)) { 65 hits++; 66 } else if (!add) { 67 // Don't care about the presence of all the bits. Just our own. 68 return 0; 69 } 70 } 71 72 if (hits == bloom->hashes) { 73 return 1; // 1 == element already in (or collision) 74 } 75 76 return 0; 77 } 78 79 80 // DEPRECATED - Please migrate to bloom_init2. 81 int bloom_init(struct bloom * bloom, int entries, double error) 82 { 83 return bloom_init2(bloom, (unsigned int)entries, error); 84 } 85 86 87 int bloom_init2(struct bloom * bloom, unsigned int entries, double error) 88 { 89 if (sizeof(unsigned long int) < 8) { 90 printf("error: libbloom will not function correctly because\n"); 91 printf("sizeof(unsigned long int) == %ld\n", sizeof(unsigned long int)); 92 exit(1); 93 } 94 95 memset(bloom, 0, sizeof(struct bloom)); 96 97 if (entries < 1000 || error <= 0 || error >= 1) { 98 return 1; 99 } 100 101 bloom->entries = entries; 102 bloom->error = error; 103 104 double num = -log(bloom->error); 105 double denom = 0.480453013918201; // ln(2)^2 106 bloom->bpe = (num / denom); 107 108 long double dentries = (long double)entries; 109 long double allbits = dentries * bloom->bpe; 110 bloom->bits = (unsigned long int)allbits; 111 112 if (bloom->bits % 8) { 113 bloom->bytes = (bloom->bits / 8) + 1; 114 } else { 115 bloom->bytes = bloom->bits / 8; 116 } 117 118 bloom->hashes = (unsigned char)ceil(0.693147180559945 * bloom->bpe); // ln(2) 119 120 bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char)); 121 if (bloom->bf == NULL) { // LCOV_EXCL_START 122 return 1; 123 } // LCOV_EXCL_STOP 124 125 bloom->ready = 1; 126 127 bloom->major = BLOOM_VERSION_MAJOR; 128 bloom->minor = BLOOM_VERSION_MINOR; 129 130 return 0; 131 } 132 133 134 int bloom_check(struct bloom * bloom, const void * buffer, int len) 135 { 136 return bloom_check_add(bloom, buffer, len, 0); 137 } 138 139 140 int bloom_add(struct bloom * bloom, const void * buffer, int len) 141 { 142 return bloom_check_add(bloom, buffer, len, 1); 143 } 144 145 146 void bloom_print(struct bloom * bloom) 147 { 148 printf("bloom at %p\n", (void *)bloom); 149 if (!bloom->ready) { printf(" *** NOT READY ***\n"); } 150 printf(" ->version = %d.%d\n", bloom->major, bloom->minor); 151 printf(" ->entries = %u\n", bloom->entries); 152 printf(" ->error = %f\n", bloom->error); 153 printf(" ->bits = %lu\n", bloom->bits); 154 printf(" ->bits per elem = %f\n", bloom->bpe); 155 printf(" ->bytes = %lu", bloom->bytes); 156 unsigned int KB = bloom->bytes / 1024; 157 unsigned int MB = KB / 1024; 158 printf(" (%u KB, %u MB)\n", KB, MB); 159 printf(" ->hash functions = %d\n", bloom->hashes); 160 } 161 162 163 void bloom_free(struct bloom * bloom) 164 { 165 if (bloom->ready) { 166 free(bloom->bf); 167 } 168 bloom->ready = 0; 169 } 170 171 172 int bloom_reset(struct bloom * bloom) 173 { 174 if (!bloom->ready) return 1; 175 memset(bloom->bf, 0, bloom->bytes); 176 return 0; 177 } 178 179 180 int bloom_save(struct bloom * bloom, char * filename) 181 { 182 if (filename == NULL || filename[0] == 0) { 183 return 1; 184 } 185 186 int fd = open(filename, O_WRONLY | O_CREAT, 0644); 187 if (fd < 0) { 188 return 1; 189 } 190 191 ssize_t out = write(fd, BLOOM_MAGIC, strlen(BLOOM_MAGIC)); 192 if (out != strlen(BLOOM_MAGIC)) { goto save_error; } // LCOV_EXCL_LINE 193 194 uint16_t size = sizeof(struct bloom); 195 out = write(fd, &size, sizeof(uint16_t)); 196 if (out != sizeof(uint16_t)) { goto save_error; } // LCOV_EXCL_LINE 197 198 out = write(fd, bloom, sizeof(struct bloom)); 199 if (out != sizeof(struct bloom)) { goto save_error; } // LCOV_EXCL_LINE 200 201 out = write(fd, bloom->bf, bloom->bytes); 202 if (out != bloom->bytes) { goto save_error; } // LCOV_EXCL_LINE 203 204 close(fd); 205 return 0; 206 // LCOV_EXCL_START 207 save_error: 208 close(fd); 209 return 1; 210 // LCOV_EXCL_STOP 211 } 212 213 214 int bloom_load(struct bloom * bloom, char * filename) 215 { 216 int rv = 0; 217 218 if (filename == NULL || filename[0] == 0) { return 1; } 219 if (bloom == NULL) { return 2; } 220 221 memset(bloom, 0, sizeof(struct bloom)); 222 223 int fd = open(filename, O_RDONLY); 224 if (fd < 0) { return 3; } 225 226 char line[30]; 227 memset(line, 0, 30); 228 ssize_t in = read(fd, line, strlen(BLOOM_MAGIC)); 229 230 if (in != strlen(BLOOM_MAGIC)) { 231 rv = 4; 232 goto load_error; 233 } 234 235 if (strncmp(line, BLOOM_MAGIC, strlen(BLOOM_MAGIC))) { 236 rv = 5; 237 goto load_error; 238 } 239 240 uint16_t size; 241 in = read(fd, &size, sizeof(uint16_t)); 242 if (in != sizeof(uint16_t)) { 243 rv = 6; 244 goto load_error; 245 } 246 247 if (size != sizeof(struct bloom)) { 248 rv = 7; 249 goto load_error; 250 } 251 252 in = read(fd, bloom, sizeof(struct bloom)); 253 if (in != sizeof(struct bloom)) { 254 rv = 8; 255 goto load_error; 256 } 257 258 bloom->bf = NULL; 259 if (bloom->major != BLOOM_VERSION_MAJOR) { 260 rv = 9; 261 goto load_error; 262 } 263 264 bloom->bf = (unsigned char *)malloc(bloom->bytes); 265 if (bloom->bf == NULL) { rv = 10; goto load_error; } // LCOV_EXCL_LINE 266 267 in = read(fd, bloom->bf, bloom->bytes); 268 if (in != bloom->bytes) { 269 rv = 11; 270 free(bloom->bf); 271 bloom->bf = NULL; 272 goto load_error; 273 } 274 275 close(fd); 276 return rv; 277 278 load_error: 279 close(fd); 280 bloom->ready = 0; 281 return rv; 282 } 283 284 285 int bloom_merge(struct bloom * bloom_dest, struct bloom * bloom_src) 286 { 287 if (bloom_dest->ready == 0) { 288 printf("bloom at %p not initialized!\n", (void *)bloom_dest); 289 return -1; 290 } 291 292 if (bloom_src->ready == 0) { 293 printf("bloom at %p not initialized!\n", (void *)bloom_src); 294 return -1; 295 } 296 297 if (bloom_dest->entries != bloom_src->entries) { 298 return 1; 299 } 300 301 if (bloom_dest->error != bloom_src->error) { 302 return 1; 303 } 304 305 if (bloom_dest->major != bloom_src->major) { 306 return 1; 307 } 308 309 if (bloom_dest->minor != bloom_src->minor) { 310 return 1; 311 } 312 313 // Not really possible if properly used but check anyway to avoid the 314 // possibility of buffer overruns. 315 if (bloom_dest->bytes != bloom_src->bytes) { 316 return 1; // LCOV_EXCL_LINE 317 } 318 319 unsigned long int p; 320 for (p = 0; p < bloom_dest->bytes; p++) { 321 bloom_dest->bf[p] |= bloom_src->bf[p]; 322 } 323 324 return 0; 325 } 326 327 328 const char * bloom_version() 329 { 330 return MAKESTRING(BLOOM_VERSION); 331 }