超快的通配符搜索

作者: 分类: 原创 时间: 2013-06-15 评论: 2条评论

基于BNDM算法,超快速搜索。搜索一个1G大小文件的中央部分数据,仅需400ms左右。

/*
    调用方式

    #include <stdio.h>

    #include "memsearch.h"

    int main()
    {
        char s[] = "01234567890123456789123455678";
        char p[] = "39 ?? ?? 33";
        printf("%d", memsearch(s, strlen(s), p) );         // -> 19
        return 0;
    }


    memsearch 返回值:
    -2 模式串p错误
    -1 没找到结果
    >=0 找到的偏移位置
    
    注意,模式串不能超过32个字节,多余部分会被舍弃
*/

// memsearch.h
#include <malloc.h>
#include <string.h>

#define     INVALID_CHAR    (unsigned char)-1
#define     WILDCARD        (unsigned short)0x100

static unsigned char GetHexValue(char ch)
{
    if (ch >= '0' && ch <= '9') return ch - '0';
    if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
    return INVALID_CHAR;
}

static unsigned short* GetPatternHex(const char *src, size_t *out_len)
{
    unsigned short* pattern = (unsigned short*)_strdup(src); // strdup已经能够保证足够存放
    unsigned short* dst = pattern;

    int waiting_count;
    int waiting_hex = 0;
    int waiting_question = 0;
    //int waiting_asterisk = 0;

    while(*src)
    {
        if(*src==' ')
        {
            src++;
            continue;
        }

        //同一时刻最多有一个等待
        waiting_count = 0;
        if(waiting_hex) waiting_count++;
        if(waiting_question) waiting_count++;
        //if(waiting_asterisk) waiting_count++;
        if(waiting_count>1) goto error;

        switch(*src)
        {
            case '?':
                if(waiting_question)
                {
                    waiting_question = 0;

                    *dst++ = WILDCARD;
                }
                else
                {
                    waiting_question = 1;
                }
                break;
            /*
            case '*':
                if(waiting_asterisk)
                {
                    waiting_asterisk = 0;

                    *dst++ = WILDCARD + 1;
                }
                else
                {
                    waiting_asterisk = 1;
                }
                break;
            */
            default:
                {
                    unsigned char ch = GetHexValue(*src);
                    if(ch==INVALID_CHAR)
                    {
                        goto error;
                    }
                    else
                    {
                        if(waiting_hex)
                        {
                            waiting_hex = 0;

                            *dst++ += ch;
                        }
                        else
                        {
                            waiting_hex = 1;

                            *dst = ch << 4;
                        }
                    }
                }
        }

        src++;
    }

    if(waiting_hex) goto error;
    if(waiting_question) goto error;
    //if(waiting_asterisk) goto error;

    *out_len = dst - pattern;
    return pattern;

error:
    free(pattern);
    *out_len = 0;
    return NULL;
}


#ifndef min
    #define min(a,b) ((a)<(b))?(a):(b)
#endif

int BNDMWildcards32(const void* src, size_t n, const void* sub, size_t m)
{
    unsigned char* s = (unsigned char*)src;
    unsigned short* p = (unsigned short*)sub;

    size_t i = 0;
    unsigned long pos = 0;
    unsigned long mask = 0;
    unsigned long b[256] = {0};
    
    m = min(m, 32);
    
    // 参数不合法
    if ( !s || !p || n < m || m == 0 )
        return -1;

    // 处理通配符
    for (i = 0; i < m; i++)
    {
        if (p[i] == WILDCARD)
        {
            mask |= 1 << (m - 1 - i);
        }
    }
    if (mask != 0)
    {
        for(i = 0; i < 256; i++)
        {
            b[i] = mask;
        }
    }

    // 预处理
    for (i = 0; i < m; i++)
    {
        if (p[i] != WILDCARD)
        {
           b[ p[i] ] |= 1 << (m - 1 - i);
        }
    }

    // BNDM 搜索
    while (pos <= n - m)
    {
        int j = m - 1;
        int last = m;
        unsigned int D = ~0;
        while (D)
        {
            D &= b[s[pos + j]];
            if (D)
            {
                if (j > 0)
                {
                    last = j;
                }
                else
                {
                    return pos;
                }
            }
            j--;
            D <<= 1;
        }
        pos += last;
    }

    return -1;
}

int memsearch(const void* src, int n, const char* pattern)
{
    int result = -2;
    // 预处理
    size_t m = 0;
    unsigned short* p = GetPatternHex(pattern, &m);

    if( !p || !m ) return result;

    // 搜索
    result = BNDMWildcards32(src, n, p, m);

    free(p);
    return result;
}
标签: none

已有 2 条评论

  1. 小小ZH
    小小ZH

    讓我頭香吧XD

    时间: 2013-06-16 01:27
  2. 「有事燒紙」
    「有事燒紙」

    icon_biggrin.gif

    时间: 2015-10-10 15:12

评论已关闭