💾 Archived View for gmi.noulin.net › gitRepositories › perfecthash › file › genHanov.c.gmi captured on 2024-09-29 at 01:15:32. Gemini links have been rewritten to link to archived content

View Raw

More Information

⬅️ Previous capture (2023-01-29)

-=-=-=-=-=-=-

perfecthash

Log

Files

Refs

README

LICENSE

genHanov.c (8309B)

     1 #! /usr/bin/env sheepy
     2 /* or direct path to sheepy: #! /usr/local/bin/sheepy */
     3 
     4 /**
     5  * simple C implementation of hanov's perfect hash: http://stevehanov.ca/blog/index.php?id=119
     6  * Minimal Perfect Hashing - Dr. Amjad M Daoud - http://iswsa.acm.org/mphf/index.html
     7  */
     8 
     9 /* Libsheepy documentation: http://spartatek.se/libsheepy/ */
    10 #include "libsheepyObject.h"
    11 #include "shpPackages/crc/crc.h"
    12 
    13 /* enable/disable logging */
    14 /* #undef pLog */
    15 /* #define pLog(...) */
    16 
    17 const char *type            = "hanovt";
    18 const char *perfectHashData = "perfectHash";
    19 const char *keysArray       = "keys";
    20 const char *funcScope       = "static inline";
    21 const char *hashFunc        = "hanovHash";
    22 const char *lookupFunc      = "hanovLookup";
    23 const char *findFunc        = "hanovFind";
    24 
    25 #define CFG_FILENAME "hanovConfig.yml"
    26 
    27 // column count in the perfectHashData and keysArray arrays
    28 #define colCount 4
    29 
    30 u32 hash(u32 d, const char *str) {
    31   if (!d) d = 0x811c9dc5;
    32   ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;
    33 }
    34 
    35 typ struct {smallArrayt *G; u32 size;} hanovt;
    36 
    37 hanovt create(const char **keys) {
    38   var size = lenG(keys);
    39   createSmallArray(buckets);
    40   loop(size) {pushG(&buckets, NULL);}
    41   hanovt res;
    42   res.G = allocG(rtSmallArrayt);
    43   loop(size) {pushG(res.G, NULL);}
    44   res.size = size;
    45 
    46   // Place all of the keys into buckets
    47   cast(char**, ks, keys);
    48   forEachS(ks, key) {
    49     var bkey   = hash(0, key) % size;
    50     var bucket = getG(&buckets, rtSmallArrayt, bkey);
    51     if (!bucket) {
    52       bucket = allocG(bucket);
    53     }
    54     pushG(bucket, key);
    55     setNFreePG(&buckets, bkey, bucket);
    56   }
    57 
    58   int bucketCmp(const void *a, const void *b) {
    59     smallArrayt *A = (smallArrayt*)a;
    60     smallArrayt *B = (smallArrayt*)b;
    61     size_t la,lb;
    62     if (!A or !isOSmallArray(A)) la = 0; else la = lenG(A);
    63     if (!B or !isOSmallArray(B)) lb = 0; else lb = lenG(B);
    64     ret lb - la;
    65   }
    66 
    67   sortFG(&buckets, bucketCmp);
    68 
    69   staticBitsetT(usedSlotst, u64 , size);
    70   usedSlotst usedSlots;
    71   staticBitsetClear(&usedSlots);
    72 
    73   size_t b;
    74   for(b = 0; b < size; b++) {
    75     var bucket = getG(&buckets, rtSmallArrayt, b);
    76     if (!bucket) continue;
    77 
    78     if(lenG(bucket) <= 1) { finishG(bucket); break;}
    79 
    80     u32 d = 1; size_t item = 0; u32 slots[size]; u32 slot; bool used[size];
    81     indexer slotsi;
    82     indexerInit(slotsi, size);
    83     range(i, size) used[i] = false;
    84 
    85     // Repeatedly try different values of d until we find a hash function
    86     // that places all items in the bucket into free slots
    87     while(item < lenG(bucket)) {
    88       slot = hash(d, getG(bucket, rtChar, item)) % size;
    89       // need hasObjectG
    90       if(staticBitsetGet(&usedSlots, slot) || used[slot]) {
    91         d++;
    92         item = 0;
    93         indexerInit(slotsi, size);
    94         range(i, size) used[i] = false;
    95       } else {
    96         used[slot] = true;
    97         indexerPush(slotsi);
    98         slots[slotsi.last] = slot;
    99         item++;
   100       }
   101     }
   102 
   103     setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, d);
   104     range(i, lenG(bucket)) {
   105       staticBitset1(&usedSlots, slots[i]);
   106     }
   107     setNFreePG(&buckets, b, bucket);
   108   }
   109 
   110   // Only buckets with 1 item remain. Process them more quickly by directly
   111   // placing them into a free slot. Use a negative value of d to indicate
   112   // this.
   113 
   114   createSmallArray(freelist);
   115   range(i, size) {
   116     if (!staticBitsetGet(&usedSlots, i)) {
   117       pushG(&freelist, i);
   118     }
   119   }
   120 
   121   for(; b < size; b++ ) {
   122     var bucket = getG(&buckets, rtSmallArrayt, b);
   123     if (!bucket) break;
   124     var slot = popG(&freelist, rtU32);
   125 
   126     // We subtract one to ensure it's negative even if the zeroeth slot was used.
   127     setG(res.G, hash(0, getG(bucket, rtChar, 0)) % size, 0-slot-1);
   128     setNFreePG(&buckets, b, bucket);
   129   }
   130 
   131   freeG(&freelist);
   132   freeG(&buckets);
   133 
   134   ret res;
   135 }
   136 
   137 u32 lookup(hanovt *GV, const char *key) {
   138   var d = getG(GV->G, rtI32, hash(0, key) % lenG(GV->G));
   139   if (d < 0) ret 0-d-1;
   140   ret hash(d, key) % GV->size;
   141 }
   142 
   143 void freeHanov(hanovt *h) {
   144   terminateG(h->G);
   145 }
   146 
   147 int main(int ARGC, char** ARGV) {
   148 
   149   initLibsheepy(ARGV[0]);
   150   setLogMode(LOG_CONCISE);
   151   setLogSymbols(LOG_UTF8);
   152 
   153   char **vector = NULL;
   154 
   155   if (ARGC < 2) {
   156     logE("key list missing");
   157     logP("\n"BLD GRN"Usage:"RST);
   158     puts(BLD "genHanov keyFileList\n\n" RST
   159          "Optional configuration file: "CFG_FILENAME"\n"
   160          "(located in directory where genHanov runs)\n\n"
   161          "---\n"
   162          "  type: hanovt\n"
   163          "  perfectHashData: perfectHash\n"
   164          "  keysArray: keys\n"
   165          "  funcScope: static inline\n"
   166          "  hashFunc: hanovHash\n"
   167          "  lookupFunc: hanovLookup\n"
   168          "  findFunc:  hanovFind"
   169          );
   170     XFAILURE;
   171   }
   172 
   173   vector = readFileG(&vector, ARGV[1]);
   174 
   175   if (!vector) {
   176     logE("Couldn't key list file "BLD RED"'%s'"RST, ARGV[1]);
   177     XFAILURE;
   178   }
   179 
   180   var han = create((const char**)vector);
   181 
   182   // load configuration
   183   createSmallJson(cfg);
   184   if (fileExists(CFG_FILENAME)) {
   185     readFileG(&cfg, CFG_FILENAME);
   186     type            = (const char *) getG(&cfg, rtChar, "type");
   187     perfectHashData = (const char *) getG(&cfg, rtChar, "perfectHashData");
   188     keysArray       = (const char *) getG(&cfg, rtChar, "keysArray");
   189     funcScope       = (const char *) getG(&cfg, rtChar, "funcScope");
   190     hashFunc        = (const char *) getG(&cfg, rtChar, "hashFunc");
   191     lookupFunc      = (const char *) getG(&cfg, rtChar, "lookupFunc");
   192     findFunc        = (const char *) getG(&cfg, rtChar, "findFunc");
   193   }
   194 
   195   // print minimal perfect hash function C code
   196 
   197   char *dataDecl = formatS("const %s %s = {{", type, perfectHashData);
   198   var ts = getCurrentDateYMD();
   199 
   200   printf("// generated by genHanov.c from perfecthash sheepy package at %s\n\n"
   201          "#include "_"libsheepyObject.h"_"\n"
   202          "#include "_"shpPackages/crc/crc.h"_"\n\n"
   203          "typ struct { u32 G[%zu]; u32 size;} %s;\n\n"
   204          "%s",
   205          ts,
   206          lenG(han.G), type,
   207          dataDecl);
   208   free(ts);
   209 
   210   size_t dataDeclLen = lenG(dataDecl) -1;
   211   free(dataDecl);
   212 
   213   char *comma[] = {"", ","};
   214 
   215   indexer cols;
   216   indexerInit(cols, colCount);
   217 
   218   iter(han.G, D) {
   219     cast(smallIntt*,d,D);
   220     if (isOSmallInt(D)) {
   221       printf("%s%u", comma[iterIndexG(han.G) ? 1 : 0], getValG(d));
   222     }
   223     else printf("%s0", comma[iterIndexG(han.G) ? 1 : 0]);
   224     indexerPush(cols);
   225     if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' ');
   226   }
   227 
   228   printf("}, %u};\n\n", han.size);
   229 
   230   staticBitsetT(usedSlotst, u64 , lenG(vector));
   231   usedSlotst usedSlots;
   232   staticBitsetClear(&usedSlots);
   233 
   234   char *keys[han.size];
   235 
   236   forEachS(vector, key) {
   237     var o = lookup(&han, key);
   238     if (staticBitsetGet(&usedSlots, o)) {
   239       logE("Collision: hash %u", o);
   240       XFAILURE;
   241     }
   242     keys[o] = key;
   243     /* logVarG(key); */
   244     /* logVarG(o); */
   245     staticBitset1(&usedSlots, o);
   246   }
   247 
   248   char *keysDecl = formatS("const char *%s[%u] = {", keysArray, han.size);
   249   printf("%s", keysDecl);
   250 
   251   dataDeclLen = lenG(keysDecl) -1;
   252   free(keysDecl);
   253 
   254   indexerInit(cols, colCount);
   255 
   256   range(i, han.size) {
   257     printf("%s"_"%s"_, comma[i ? 1 : 0], keys[i]);
   258     indexerPush(cols);
   259     if (indexerLast(cols) == cols.maxCount -1) printf("\n%*c",dataDeclLen,' ');
   260   }
   261 
   262   printf("};\n\n");
   263 
   264   printf("%s u32 %s(u32 d, const char *str) {\n"
   265          "  if (!d) d = 0x811c9dc5;\n"
   266          "  ret crc32_Buf(d, (void*)str, lenG(str)) & 0x7fffffff;\n"
   267          "}\n\n"
   268          "%s u32 %s(const %s *GV, const char *key) {\n"
   269          "  var d = (i32)GV->G[%s(0, key) %% ARRAY_SIZE(GV->G)];\n"
   270          "  if (d < 0) ret 0-d-1;\n"
   271          "  ret %s(d, key) %% GV->size;\n"
   272          "}\n\n"
   273          "// lookup and return -1 when key is not found\n"
   274          "%s ssize_t %s(const %s *GV, const char *key) {\n"
   275          "  var idx = %s(GV, key);\n\n"
   276          "  if (eqG(key, %s[idx])) ret idx;\n"
   277          "  else ret -1; // key is not found\n"
   278          "}\n\n"
   279          "// Loopup:   u32 idx = %s(&%s, "_"key"_");\n"
   280          "// Find key: u32 idx = %s(&%s, "_"key"_");",
   281          funcScope, hashFunc,
   282          funcScope, lookupFunc, type,
   283          hashFunc,
   284          hashFunc,
   285          funcScope, findFunc, type,
   286          lookupFunc,
   287          keysArray,
   288          lookupFunc, perfectHashData,
   289          findFunc, perfectHashData);
   290 
   291   freeG(&cfg);
   292   freeG(vector);
   293   freeHanov(&han);
   294 }
   295 // vim: set expandtab ts=2 sw=2: