Shell's Home

Dec 21, 2006 - 4 minute read - Comments

正则表达式解析文本

最近碰到这么个问题,一个文本,每行都是乱糟糟的东西,要从里面解析出东西来。行匹配铁定是用正则表达式,我用了Boost,不会的看我前两天的blog去。

下面是按行解析问题。简单来说,写一个类继承Lister,然后实现里面的三个纯虚方法。maxSeqence返回最大可以支持的表达式,registe_regex返回表达式文本,seqenceProcess返回相应函数的指针。其实可以写成直接调用seqenceProcess加上匹配序号,然后让用户在函数内部做switch-case的。不过这样用户代码量稍微有点多,所以干脆玩一把技术。然后是几个非纯虚函数,nextSeqence可以根据当前状态来控制下一个要匹配的表达式,默认是+1,一个一个全部匹配。beforeProcess和afterProcess分别是处理前后,可以调整输入流。noMatch是一个比较常用的虚函数,用于响应没有匹配时的状态。

匹配的结果在cmatch & what中,详细请看boost::regex。不过what[0].str()可以获得整句的string型返回,what[1]开始就是正则的匹配结果。

———2006-12-25—————

原来的结果删除,我重写了一个。

主要有两个问题,一个是getline的效率问题,我会撰文说明的。还有就是两处细节不大好。

为了修正这两个问题,我突然发现整个的构架不大好了——怎么办?重写吧——

下面是新的,一个类line_regex,直接继承就好。line_buffer是用于解决getline效率不高的问题的,当然我偷了个懒,实现代码用了WIN32API,所以是不可移植的。而且数据是一次读取,最多256M。不过相信这种级别的问题还难不倒大家。line_buffer类中的函数都很清晰明了,就不介绍了。

Process (tistream & is)和Process (line_buffer & lb)是两大入口,同时支持自有的输入方法和流输入。流输入清晰明了、标准化程度高,不过效率差得一塌糊涂。继承类初始化的时候,记得把pfTable设置为入口列表(即各表达式对应的处理函数表),然后调用注册函数registe_expression完成注册。nextSeqence和上面一样,可以定制下一个匹配式。noMatch在没有任何表达式匹配时被调用。beforeProcess和afterProcess分别会在某行开始匹配前和结束匹配后调用,返回负数(例如-1)结束运行。其中beforeProcess返回正数会导致本行跳过,可以作为过滤器。

---------------------LineRegex.h--------------------

#include
#include
#include
#include
#include
#include
#include
using namespace std;
using namespace boost;

typedef basic_stringtstring;
typedef basic_regextregex;
typedef match_resultstmatch;
typedef basic_istream >tistream;

#ifndef_LINE_REGEX_H_
#define_LINE_REGEX_H_

class line_regex;
typedef int (line_regex::*ProcessFunction) (const tmatch & what,
                        int line);

class line_buffer {
public:
    line_buffer();
    ~line_buffer();
    intopen(LPCTSTR lpPath);
    voidclose();
    LPTSTRgetline();
    longsize();
protected:
    UINTFileSize;
    LPVOIDlpFile;
    TCHAR *lpNow, *lpNext;
};

class line_regex {
public:
    line_regex();
    ~line_regex();
    virtualintnextSeqence(int seqence);
    virtual intnoMatch(LPTSTR strLine, int line);
    virtual int beforeProcess (LPTSTR strLine, int line);
    virtual int afterProcess (LPTSTR strLine, int line);
    voidregiste_expression(LPCTSTRexps[]);
    int Process (tistream & is);
    int Process (line_buffer & lb);
protected:
    intProcessLine(LPTSTR strLine, int line);
    longmaxSeqence;
    ProcessFunction*pfTable;
    tregex*expressions;
    tmatchwhat;
};

#endif//_LINE_REGEX_H_

----------------------------------------------------

----------------------LineRegex.cpp-----------------

#include "stdafx.h"

// Creates an empty buffer. Every member is initialised so that getline(),
// size() and close() are safe to call before any file has been opened.
line_buffer::line_buffer ()
    : FileSize (0), lpFile (NULL), lpNow (NULL), lpNext (NULL)
{
}

// Destructor: hands the cleanup of any loaded data over to close().
line_buffer::~line_buffer ()
{
    this->close ();
}

// Reads the whole file at lpPath into a freshly allocated buffer and positions
// the line cursor at its start.
//
// Returns 0 on success and -1 on any failure (NULL path, file cannot be
// opened, size unknown or above 256 MB, allocation failure, read error).
// On failure the object is left empty.
int line_buffer::open (LPCTSTR lpPath)
{
    // Drop whatever an earlier open() loaded so the old buffer is not leaked,
    // and leave the cursors in a defined state for every failure path below.
    close ();
    lpNow = NULL;
    lpNext = NULL;
    FileSize = 0;

    if (lpPath == NULL)
        return -1;

    // OPEN_EXISTING: a reader must not create the file when it is missing
    // (OPEN_ALWAYS would silently create an empty one).
    HANDLE hFile = CreateFile (lpPath, GENERIC_READ, FILE_SHARE_READ, NULL,
                   OPEN_EXISTING, 0, NULL);
    if (INVALID_HANDLE_VALUE == hFile)
        return -1;

    int rslt = -1;
    BYTE *lpData = NULL;
    const DWORD dwSize = GetFileSize (hFile, NULL);
    if (dwSize != INVALID_FILE_SIZE && dwSize <= 0x10000000) {
        try {
            // One spare TCHAR so the last line is NUL-terminated even when
            // the file does not end with a newline.
            lpData = new BYTE[dwSize + sizeof (TCHAR)];
        }
        catch (std::exception &) {
            lpData = NULL;
        }
    }

    if (lpData != NULL) {
        DWORD dwBytes = 0;
        // ReadFile returns a BOOL: zero means failure, it is never negative.
        if (ReadFile (hFile, lpData, dwSize, &dwBytes, NULL)) {
            for (size_t k = 0; k < sizeof (TCHAR); k++)
                lpData[dwBytes + k] = 0;
            FileSize = dwBytes;
            lpFile = lpData;
            lpNext = (TCHAR *) lpData;
            rslt = 0;
        }
        else {
            delete[]lpData;
        }
    }

    // Reached on every path after a successful CreateFile, so the handle is
    // closed exactly once and never when it is invalid.
    CloseHandle (hFile);
    return rslt;
}

void line_buffer::close ()
{
    if (lpFile != NULL)
        delete lpFile;
    lpFile = NULL;
    return;
}

// Returns a pointer to the next line inside the buffer, or NULL when no line
// is left (or nothing is loaded).
//
// Lines end at "\n" or "\r\n"; the terminator is overwritten in place with a
// NUL so the returned pointer is a C string. A lone '\r' is kept as data.
// NOTE: a final line without a trailing newline is returned as-is; it is
// NUL-terminated only if the buffer holds a terminator past FileSize bytes.
LPTSTR line_buffer::getline ()
{
    if (lpFile == NULL || lpNext == NULL)
        return NULL;

    // Whole characters only: a trailing partial TCHAR is ignored.
    TCHAR *const lpEnd = (TCHAR *) lpFile + FileSize / sizeof (TCHAR);

    lpNow = lpNext;
    if (lpNow >= lpEnd) {
        // All data consumed: do not hand out a pointer to the buffer end.
        lpNow = NULL;
        lpNext = NULL;
        return NULL;
    }

    for (; lpNext < lpEnd; lpNext++) {
        if (*lpNext == _T ('\n')) {
            *lpNext = _T ('\0');
            lpNext++;
            return lpNow;
        }
        // Look at the following character only if it is inside the buffer.
        if ((*lpNext == _T ('\r')) && (lpNext + 1 < lpEnd)
            && (*(lpNext + 1) == _T ('\n'))) {
            *lpNext = _T ('\0');
            lpNext += 2;
            return lpNow;
        }
    }

    // Last line, not followed by a newline.
    lpNext = NULL;
    return lpNow;
}

long line_buffer::size ()
{
    return 0;
}

// Starts with no expressions registered and no handler table attached.
line_regex::line_regex ()
    : maxSeqence (0), pfTable (NULL), expressions (NULL)
{
}

// Frees the compiled expressions. The handler table is only detached, never
// freed here.
line_regex::~line_regex ()
{
    delete[]expressions;    // no-op when nothing was registered
    expressions = NULL;
    pfTable = NULL;
    maxSeqence = 0;
}

// Default ordering: after expression `seqence`, try the one right behind it.
int line_regex::nextSeqence (int seqence)
{
    const int following = seqence + 1;
    return following;
}

// Default handler for a line no expression matched: does nothing, returns 0.
int line_regex::noMatch (LPTSTR strLine, int line)
{
    (void) strLine;     // unused in the default implementation
    (void) line;
    return 0;
}

// Default pre-match hook: does nothing and returns 0.
int line_regex::beforeProcess (LPTSTR strLine, int line)
{
    (void) strLine;     // unused in the default implementation
    (void) line;
    return 0;
}

// Default post-match hook: does nothing and returns 0.
int line_regex::afterProcess (LPTSTR strLine, int line)
{
    (void) strLine;     // unused in the default implementation
    (void) line;
    return 0;
}

// Replaces the registered expressions with the patterns in `exps`, an array
// of pattern strings terminated by a NULL entry.
//
// If a pattern cannot be compiled, the error is reported on cout and the
// object is left with NO expressions registered, so matching never runs on a
// half-initialised table. A NULL `exps` just clears the registration.
void line_regex::registe_expression (LPCTSTR exps[])
{
    // Drop the previous set first; maxSeqence stays 0 until the new set has
    // been compiled completely.
    delete[]expressions;
    expressions = NULL;
    maxSeqence = 0;
    if (exps == NULL)
        return;

    long count = 0;
    while (exps[count] != NULL)
        count++;

    try {
        expressions = new tregex[count];
        for (long i = 0; i < count; i++)
            expressions[i].assign (exps[i]);
        maxSeqence = count;
    }
    catch (std::exception & e) {
        delete[]expressions;
        expressions = NULL;
        cout << "Error in expression: \"" << e.what () << "\""
             << endl;
    }
}

// Reads `is` line by line and runs each line through beforeProcess,
// ProcessLine and afterProcess.
//
// Returns the first negative value produced by any of those steps (which
// stops the run), otherwise 0. A positive result from beforeProcess skips the
// line. Exceptions derived from std::exception are reported on cout and end
// the run with 0.
int line_regex::Process (tistream & is)
{
    int line = 0;
    tstring str;
    try {
        while (getline (is, str)) {
            line++;
            // The hooks take a non-const pointer; they receive the string's
            // own storage, so they must not write past its length.
            LPTSTR lpLine = const_cast<LPTSTR> (str.c_str ());

            int rslt = beforeProcess (lpLine, line);
            if (rslt < 0)
                return rslt;
            // Was a second "rslt < 0" test, which could never be reached, so
            // the filter never skipped anything.
            if (rslt > 0)
                continue;

            rslt = ProcessLine (lpLine, line);
            if (rslt < 0)
                return rslt;

            rslt = afterProcess (lpLine, line);
            if (rslt < 0)
                return rslt;
        }
    }
    catch (std::exception & e) {
        cout << "Error in expression: \"" << e.what () << "\""
             << endl;
    }
    return 0;
}

// Pulls lines from `lb` until it returns NULL and runs each line through
// beforeProcess, ProcessLine and afterProcess.
//
// Returns the first negative value produced by any of those steps (which
// stops the run), otherwise 0. A positive result from beforeProcess skips the
// line. Exceptions derived from std::exception are reported on cout and end
// the run with 0.
int line_regex::Process (line_buffer & lb)
{
    int line = 0;
    LPTSTR lpBuffer;
    try {
        while ((lpBuffer = lb.getline ()) != NULL) {
            line++;

            int rslt = beforeProcess (lpBuffer, line);
            if (rslt < 0)
                return rslt;
            // Was a second "rslt < 0" test, which could never be reached, so
            // the filter never skipped anything.
            if (rslt > 0)
                continue;

            rslt = ProcessLine (lpBuffer, line);
            if (rslt < 0)
                return rslt;

            rslt = afterProcess (lpBuffer, line);
            if (rslt < 0)
                return rslt;
        }
    }
    catch (std::exception & e) {
        cout << "Error in expression: \"" << e.what () << "\""
             << endl;
    }
    return 0;
}

// Tries the registered expressions against one line, in the order given by
// nextSeqence(), and calls the handler paired with the first one that matches.
//
// Returns the handler's result if it is negative, -1 if an expression matched
// but no handler is available for it, and 0 otherwise. noMatch() is called
// when the search runs off the end of the table (index == maxSeqence); its
// return value is ignored.
int line_regex::ProcessLine (LPTSTR strLine, int line)
{
    int i, rslt;
    // "i >= 0" keeps a negative index from an overridden nextSeqence() from
    // being used to index the tables.
    for (i = 0; i >= 0 && i < maxSeqence; i = nextSeqence (i)) {
        if (regex_search (strLine, what, expressions[i])) {
            // A match without a handler cannot be dispatched.
            if (pfTable == NULL || pfTable[i] == NULL)
                return -1;
            rslt = (this->*(pfTable[i])) (what, line);
            if (rslt < 0)
                return rslt;
#ifdef _DEBUG_OUT_
// cout << line << ":match" << i << endl;
#endif //_DEBUG_OUT_
            break;
        }
    }
    if (i == maxSeqence) {
        noMatch (strLine, line);
#ifdef _DEBUG_OUT_
// cout << line << ":no match" << endl;
#endif //_DEBUG_OUT_
    }
    return 0;
}

----------------------------------------------------

给个例子:

// Example subclass constructor: registers the patterns and attaches the
// handler table.
//
// NOTE(review): several pattern literals below look damaged by text
// extraction (a literal split across two lines before "Rect$", bracket runs
// such as "[\\[[0-9\]()", single-backslash "\." escapes). They are kept
// verbatim here; confirm against the original source before use.
PsParser::PsParser ()
{
    // NULL-terminated: registe_expression() counts entries up to the first
    // NULL.
    static LPCTSTR regex_exp[] = {
        _T ("^/RsvMt matrix currentmatrix def [\\[[0-9\]().]*[0-9\.]* [0-9\.]* [0-9\.]* ([0-9\.]*) ([0-9\.]*)\\]concat$"),
        _T ("^.*-([0-9]*)X([0-9]*) setfont$"),
//_T ("^(.*) setfont$"),//
        _T ("^([0-9]*) ([0-9]*) m$"),
        _T ("^\\((.*)\\) [\\[(.*)\]()] 2 (?:fxs|fys|VT)$"),
        _T ("^%%DownLoadCode (.*)$"),
        _T ("^RsvMt setmatrix gr$"),
        _T ("^([0-9\.]*) ([0-9\.]*) ([0-9\.]*) ([0-9\.]*)
Rect$"),
        _T ("^\\((.*)\\) [\\S]().* file bOpenFile\\{ closefile $"),
        _T ("^newpath ([0-9\.]*) ([0-9\.]*) moveto $"),
        _T ("^ ([0-9\.]*) ([0-9\.]*) lineto$"),
        NULL
    };

    // Positional pairing: entry i is the handler called when regex_exp[i]
    // matches, so both tables must list their entries in the same order.
    // NOTE(review): the handlers are named without "&PsParser::" — presumably
    // accepted by the compiler this was written for; confirm.
    static ProcessFunction pfTableStatic[] = {
        (ProcessFunction) text_start_block,
        (ProcessFunction) text_setfont,
        (ProcessFunction) text_movement,
        (ProcessFunction) text_out,
        (ProcessFunction) text_download_code,
        (ProcessFunction) text_end_block,
        (ProcessFunction) pic_text,
        (ProcessFunction) pic_out,
        (ProcessFunction) line_start,
        (ProcessFunction) line_to
    };

    this->registe_expression ((LPCTSTR *) regex_exp);
    // Only the pointer is stored; the table is static, so it stays valid
    // after the constructor returns.
    pfTable = pfTableStatic;
    return;
}

// Example driver: loads the file named by the first command-line argument,
// runs it through PsParser and prints the value of clock() afterwards.
// Returns 0 on success, 1 when the argument is missing or the file cannot be
// loaded.
int _tmain (int argc, _TCHAR * argv[])
{
    // argv[1] exists only when an argument was actually supplied.
    if (argc < 2) {
        printf ("usage: <program> <input file>\n");
        return 1;
    }

    line_buffer lb;
    PsParser p;
    if (lb.open (argv[1]) != 0) {
        printf ("cannot open input file\n");
        return 1;
    }
    p.Process (lb);
    lb.close ();
    // clock_t is not guaranteed to be int: convert explicitly to match the
    // format specifier.
    printf ("%ld\n", (long) clock ());
    return 0;
}