Skip to content

Commit 66d307b

Browse files
committed
ADDED: Add stepwise Aho-Corasick search
1 parent 2082284 commit 66d307b

2 files changed

Lines changed: 108 additions & 7 deletions

File tree

src/aho-corasick.c

Lines changed: 104 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161

6262
/// Node used to build Aho-Corasick search trie
6363
typedef struct {
64-
char c;
64+
unsigned char c;
6565
unsigned char type;
6666
int len;
6767

@@ -490,25 +490,29 @@ int options[] = {
490490
char * haystack[] = {
491491
"footbally",
492492
"ufootbally",
493-
"footbal"
493+
"footbal",
494+
"föot"
494495
};
495496

496497

497-
char * results[3][3] = {
498+
char * results[3][4] = {
498499
{
499500
"foot\notb\nfootball\nball\nally\n",
500501
"ufo\nfoot\notb\nfootball\nball\nally\n",
501-
"foot\notb\n"
502+
"foot\notb\n",
503+
"föot\n"
502504
},
503505
{
504506
"foot\nfootball\n",
505507
"ufo\notb\nally\n",
506-
"foot\n"
508+
"foot\n",
509+
"föot\n"
507510
},
508511
{
509512
"football\n",
510513
"ufo\notb\nally\n",
511-
"foot\n"
514+
"foot\n",
515+
"föot\n"
512516
}
513517
};
514518

@@ -559,6 +563,7 @@ void Test_ac_search(CuTest * tc) {
559563
ac_insert(a, "ally", 39);
560564
ac_insert(a, "ufo", 38);
561565
ac_insert(a, "otb", 37);
566+
ac_insert(a, "föot", 69);
562567

563568
F(i, (int) (sizeof(options) / sizeof(options[0]))) {
564569
// Prepare AC trie with new options
@@ -586,6 +591,99 @@ void Test_ac_search(CuTest * tc) {
586591
#endif
587592

588593

594+
/// Monitor one character at a time for matches
595+
size_t ac_step(size_t s, ac * a, int options, unsigned char c, size_t * len, unsigned char * type) {
596+
*len = -1;
597+
*type = '\0';
598+
599+
// Check for path that allows us to match next character
600+
while (s && a->node[s].child[c] == 0) {
601+
s = a->node[s].ac_fail;
602+
}
603+
604+
// Accept next character
605+
s = a->node[s].child[c];
606+
607+
// Do we have a match
608+
size_t temp_s = s;
609+
610+
while (temp_s) {
611+
if (a->node[temp_s].type) {
612+
// This is a match
613+
if (*len != -1) {
614+
if (options & AC_LONGEST) {
615+
// Is this longer than the current match?
616+
if (*len == a->node[temp_s].len) {
617+
// Update existing match
618+
*len = a->node[temp_s].len;
619+
*type = a->node[temp_s].type;
620+
} else {
621+
// Ignore this match
622+
}
623+
} else {
624+
// Ignore this match
625+
}
626+
} else {
627+
*len = a->node[temp_s].len;
628+
*type = a->node[temp_s].type;
629+
}
630+
}
631+
632+
temp_s = a->node[temp_s].ac_fail;
633+
}
634+
635+
return s;
636+
}
637+
638+
639+
#ifdef TEST
640+
641+
/// Expected match type reported by ac_step() in Test_ac_step: indexed as
/// step_result[i][j], where i selects the entry of options[] being tested
/// and j is the position of the byte just fed from the haystack "foobar".
/// 0 means no match is expected after that byte.
unsigned char step_result[3][6] = {
642+
{ 0, 0, 42, 0, 0, 44 },
643+
{ 0, 0, 42, 0, 0, 44 },
644+
{ 0, 0, 42, 0, 0, 44 }
645+
};
646+
647+
/// Feed "foobar" to ac_step() one byte at a time, for every entry in
/// options[], and check the match type reported after each byte against
/// step_result.  When a match is reported, its length is checked as well.
void Test_ac_step(CuTest * tc) {
	ac * a = ac_new(0);

	ac_insert(a, "foo", 42);
	ac_insert(a, "bar", 43);
	ac_insert(a, "foobar", 44);

	// Declared as an array, not a pointer, so that sizeof yields the size of
	// the string (including its terminator) rather than the size of a pointer.
	const char haystack[] = "foobar";
	const int haystack_len = (int) (sizeof(haystack) - 1);

	F(i, (int) (sizeof(options) / sizeof(options[0]))) {
		ac_prepare(a, options[i]);
		size_t s = 0;
		size_t len = 0;
		unsigned char type = 0;

		F(j, haystack_len) {
			s = ac_step(s, a, options[i], (unsigned char) haystack[j], &len, &type);
			CuAssertIntEquals(tc, step_result[i][j], type);

			switch (type) {
				case 0:
					// No match reported after this byte
					break;

				case 42:
					CuAssertIntEquals(tc, 3, (int) len);
					break;

				case 43:
					CuAssertIntEquals(tc, 3, (int) len);
					break;

				case 44:
					CuAssertIntEquals(tc, 6, (int) len);
					break;
			}
		}
	}

	ac_free(a);
}
683+
684+
#endif
685+
686+
589687
static void trie_node_to_graphviz(ac * a, size_t s, FILE * out) {
590688
trie_node * n = &a->node[s];
591689

@@ -616,4 +714,3 @@ void ac_to_graphviz(ac * a, FILE * out) {
616714

617715
fprintf(out, "}\n");
618716
}
619-

src/aho-corasick.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,10 @@ void ac_prepare(ac * a, int options);
9191
match * ac_search(ac * a, int options, const unsigned char * source, size_t start, size_t len);
9292

9393

94+
/// Monitor one character at a time for matches.
///
/// Advances the search from state `s` by the single byte `c` and returns the
/// new state, which should be passed back in as `s` on the next call (the
/// test starts from state 0).  On return, `*len` and `*type` describe a
/// pattern ending at this byte; if none does, `*type` is 0 and `*len` is
/// (size_t) -1.  `options` is checked for AC_LONGEST.
95+
size_t ac_step(size_t s, ac * a, int options, unsigned char c, size_t * len, unsigned char * type);
96+
97+
9498
/// Free linked list of matches
9599
void match_free(match * m);
96100

0 commit comments

Comments
 (0)