正则表达式解析文本
最近碰到这么个问题,一个文本,每行都是乱糟糟的东西,要从里面解析出东西来。行匹配铁定是用正则表达式,我用了Boost,不会的看我前两天的blog去。
下面是按行解析问题。简单来说,写一个类继承Lister,然后实现里面的三个纯虚方法:maxSeqence返回最多可以支持的表达式个数,registe_regex返回表达式文本,seqenceProcess返回相应处理函数的指针。其实也可以写成直接调用seqenceProcess并传入匹配序号,然后让用户在函数内部做switch-case,不过这样用户的代码量稍微有点多,所以干脆玩一把技术。然后是几个非纯虚函数:nextSeqence可以根据当前状态来控制下一个要匹配的表达式,默认是+1,即一个一个全部匹配;beforeProcess和afterProcess分别在处理前后调用,可以调整输入流;noMatch是一个比较常用的虚函数,用于响应没有匹配时的状态。
匹配的结果在cmatch & what中,详细请看boost::regex。不过what[0].str()可以获得整句的string型返回,what[1]开始就是正则的匹配结果。
———2006-12-25—————
原来的结果删除,我重写了一个。
主要有两个问题,一个是getline的效率问题,我会撰文说明的。还有就是两处细节不大好。
为了修正这两个问题,我突然发现整个的构架不大好了——怎么办?重写吧——
下面是新的,一个类line_regex,直接继承就好。line_buffer是用于解决getline效率不高的问题的,当然我偷了个懒,实现代码用了WIN32API,所以是不可移植的。而且数据是一次读取,最多256M。不过相信这种级别的问题还难不倒大家。line_buffer类中的函数都很清晰明了,就不介绍了。
Process (tistream & is)和Process (line_buffer & lb)是两大入口,同时支持自有的输入方法和流输入。当然流输入清晰明了,标准化程度高,不过效率差得一塌糊涂。继承类初始化的时候,记得设置pfTable为入口列表,然后调用注册函数完成注册。nextSeqence和上面一样,可以定制下一个匹配式。noMatch用于无匹配的时候。beforeProcess和afterProcess分别会在某行开始匹配之前和匹配结束之后调用,返回-1结束运行。其中beforeProcess返回正数会导致本行跳过,可以作为过滤器。
---------------------LineRegex.h--------------------
#include
#include
#include
#include
#include
#include
#include
using namespace std;
using namespace boost;
typedef basic_stringtstring;
typedef basic_regextregex;
typedef match_resultstmatch;
typedef basic_istream >tistream;
#ifndef_LINE_REGEX_H_
#define_LINE_REGEX_H_
// Forward declaration so the member-function pointer type below can name the class.
class line_regex;
// Handler signature: a line_regex member function that receives the match
// results of the expression that matched and the 1-based line number.
// ProcessLine() treats a negative return value as "abort" and passes it up.
typedef int (line_regex::*ProcessFunction) (const tmatch & what,
int line);
class line_buffer {
public:
line_buffer();
~line_buffer();
intopen(LPCTSTR lpPath);
voidclose();
LPTSTRgetline();
longsize();
protected:
UINTFileSize;
LPVOIDlpFile;
TCHAR *lpNow, *lpNext;
};
class line_regex {
public:
line_regex();
~line_regex();
virtualintnextSeqence(int seqence);
virtual intnoMatch(LPTSTR strLine, int line);
virtual int beforeProcess (LPTSTR strLine, int line);
virtual int afterProcess (LPTSTR strLine, int line);
voidregiste_expression(LPCTSTRexps[]);
int Process (tistream & is);
int Process (line_buffer & lb);
protected:
intProcessLine(LPTSTR strLine, int line);
longmaxSeqence;
ProcessFunction*pfTable;
tregex*expressions;
tmatchwhat;
};
#endif//_LINE_REGEX_H_
----------------------------------------------------
----------------------LineRegex.cpp-----------------
#include "stdafx.h"
// Creates an empty buffer with no file loaded.
// Every member is initialised so that getline() and close() behave
// predictably even if open() was never called (the cursors used to be left
// uninitialised).
line_buffer::line_buffer ()
    : FileSize (0), lpFile (NULL), lpNow (NULL), lpNext (NULL)
{
}
// Releases the loaded data, if any, by delegating to close().
line_buffer::~line_buffer ()
{
    this->close ();
}
// Loads the whole file at lpPath into memory.
//
// Returns 0 on success and -1 on failure (file cannot be opened, is larger
// than 0x10000000 bytes, cannot be allocated or cannot be read). On failure
// the object is left closed.
//
// Changes from the original:
//  * OPEN_EXISTING instead of OPEN_ALWAYS, so a mistyped path no longer
//    creates an empty file;
//  * ReadFile() returns a BOOL (zero on failure), so testing it with `< 0`
//    never detected an error;
//  * the handle is closed only if it was actually opened;
//  * a previously loaded buffer is released instead of leaked;
//  * the data is followed by zero bytes so the last line is a terminated
//    string even when the file does not end with a newline.
int line_buffer::open (LPCTSTR lpPath)
{
    HANDLE hFile = INVALID_HANDLE_VALUE;
    DWORD dwBytes = 0;
    int rslt = -1;

    close ();               // drop any previously loaded file
    lpNow = NULL;
    lpNext = NULL;

    __try {
        hFile = CreateFile (lpPath, GENERIC_READ, FILE_SHARE_READ, NULL,
                            OPEN_EXISTING, 0, NULL);
        if (INVALID_HANDLE_VALUE == hFile)
            __leave;

        // A GetFileSize() failure yields 0xFFFFFFFF and is rejected by the
        // same upper bound as an oversized file.
        FileSize = GetFileSize (hFile, NULL);
        if (FileSize > 0x10000000)
            __leave;

        // Two spare TCHARs: enough for a full zero TCHAR after the data even
        // if the byte count is not a multiple of sizeof (TCHAR).
        const UINT cbAlloc = FileSize + 2 * sizeof (TCHAR);
        lpFile = new BYTE[cbAlloc];
        if (lpFile == NULL)
            __leave;

        if (!ReadFile (hFile, lpFile, FileSize, &dwBytes, NULL))
            __leave;

        FileSize = dwBytes;     // only the bytes actually read are valid
        ZeroMemory ((BYTE *) lpFile + dwBytes, cbAlloc - dwBytes);
        lpNext = (TCHAR *) lpFile;
        rslt = 0;
    }
    __finally {
        if (hFile != INVALID_HANDLE_VALUE)
            CloseHandle (hFile);
    }

    if (rslt != 0) {
        // Leave the object in the closed state after any failure.
        close ();
        FileSize = 0;
        lpNow = NULL;
        lpNext = NULL;
    }
    return rslt;
}
void line_buffer::close ()
{
if (lpFile != NULL)
delete lpFile;
lpFile = NULL;
return;
}
// Returns the next line of the loaded data, or NULL when there is none.
//
// The "\n" or "\r\n" terminator is overwritten with a NUL in place, so the
// returned pointer is a C string inside the buffer.
//
// Changes from the original:
//  * the character escapes ('\n', '\r', '\0') lost from the published
//    listing are restored;
//  * the look-ahead after '\r' is bounds-checked instead of reading one
//    element past the data;
//  * when the cursor is already at the end of the data (file ended with a
//    newline, or the file is empty) NULL is returned instead of a pointer to
//    the position past the last byte.
//
// NOTE(review): a last line without a trailing newline is returned without
// this function writing a terminator; the buffer itself must provide one.
LPTSTR line_buffer::getline ()
{
    lpNow = lpNext;
    if (lpFile == NULL || lpNow == NULL)
        return NULL;

    // Only whole TCHARs are scanned.
    TCHAR *const lpEnd = (TCHAR *) lpFile + FileSize / sizeof (TCHAR);

    if (lpNow >= lpEnd) {
        lpNext = NULL;
        return NULL;
    }

    for (; lpNext < lpEnd; lpNext++) {
        if (*lpNext == _T ('\n')) {
            *lpNext = _T ('\0');
            lpNext++;
            return lpNow;
        }
        if (*lpNext == _T ('\r') && lpNext + 1 < lpEnd
            && *(lpNext + 1) == _T ('\n')) {
            *lpNext = _T ('\0');
            lpNext += 2;
            return lpNow;
        }
    }

    // Ran off the end: this is the final line; the next call returns NULL.
    lpNext = NULL;
    return lpNow;
}
long line_buffer::size ()
{
return 0;
}
// Starts with no expressions registered and no handler table attached.
line_regex::line_regex ()
    : maxSeqence (0), pfTable (NULL), expressions (NULL)
{
}
// Frees the compiled expressions. The handler table is only detached, never
// freed here.
line_regex::~line_regex ()
{
    delete[] expressions;   // deleting a null pointer is a no-op
    expressions = NULL;
    pfTable = NULL;
    maxSeqence = 0;
}
// Default ordering: try the expressions one after another.
// Returns the index that follows `seqence`.
int line_regex::nextSeqence (int seqence)
{
    const int following = seqence + 1;
    return following;
}
// Default hook for a line that no expression matched: does nothing.
// Always returns 0.
int line_regex::noMatch (LPTSTR strLine, int line)
{
    (void) strLine;     // unused by the default implementation
    (void) line;
    return 0;
}
// Default hook run before a line is matched: does nothing and returns 0.
// An override that returns a negative value makes Process() return that
// value immediately.
int line_regex::beforeProcess (LPTSTR strLine, int line)
{
    (void) strLine;     // unused by the default implementation
    (void) line;
    return 0;
}
// Default hook run after a line has been matched: does nothing and returns 0.
// An override that returns a negative value makes Process() return that
// value immediately.
int line_regex::afterProcess (LPTSTR strLine, int line)
{
    (void) strLine;     // unused by the default implementation
    (void) line;
    return 0;
}
// Compiles a NULL-terminated array of expression strings, replacing any
// previously registered set.
//
// exps: array of pattern strings ending with a NULL entry; a NULL array
//       registers nothing.
//
// If any pattern fails to compile, the error is printed and the object is
// left with no expressions at all, rather than with maxSeqence claiming
// entries that were never assigned.
//
// The quotes in the error message had lost their backslashes in the published
// listing, which turned `<< e.what () <<` into literal text; they are
// restored here so the real reason is printed.
void line_regex::registe_expression (LPCTSTR exps[])
{
    // Drop the old set first; the count is published only after every
    // pattern has compiled.
    delete[] expressions;
    expressions = NULL;
    maxSeqence = 0;

    if (exps == NULL)
        return;

    try {
        int count = 0;
        while (exps[count] != NULL)
            count++;

        expressions = new tregex[count];
        for (int i = 0; i < count; i++)
            expressions[i].assign (exps[i]);
        maxSeqence = count;
    }
    catch (const std::exception & e) {
        cout << "Error in expression: \"" << e.what () << "\""
             << endl;
        delete[] expressions;
        expressions = NULL;
        maxSeqence = 0;
    }
}
// Runs every line read from `is` through the before / match / after steps.
//
// Returns the first negative value produced by beforeProcess(), ProcessLine()
// or afterProcess(); otherwise 0 once the stream is exhausted. A
// std::exception is reported on cout and also ends with 0.
//
// Fixes: the "skip this line" test was a second copy of `rslt < 0` and could
// never be reached; it now tests for a positive result, which is the
// documented filter behaviour of beforeProcess(). The escaped quotes in the
// error message are restored so e.what() is actually printed.
int line_regex::Process (tistream & is)
{
    int rslt, line = 0;
    tstring str;

    try {
        while (getline (is, str)) {
            line++;
            // The hooks take a mutable pointer; the cast is kept from the
            // original interface.
            LPTSTR lpLine = (LPTSTR) str.c_str ();

            rslt = beforeProcess (lpLine, line);
            if (rslt < 0)
                return rslt;
            if (rslt > 0)
                continue;       // filtered out by beforeProcess()

            rslt = ProcessLine (lpLine, line);
            if (rslt < 0)
                return rslt;

            rslt = afterProcess (lpLine, line);
            if (rslt < 0)
                return rslt;
        }
    }
    catch (const std::exception & e) {
        cout << "Error in expression: \"" << e.what () << "\""
             << endl;
    }
    return 0;
}
// Runs every line supplied by `lb` through the before / match / after steps.
//
// Returns the first negative value produced by beforeProcess(), ProcessLine()
// or afterProcess(); otherwise 0 once lb.getline() returns NULL. A
// std::exception is reported on cout and also ends with 0.
//
// Fixes: the "skip this line" test was a second copy of `rslt < 0` and could
// never be reached; it now tests for a positive result, which is the
// documented filter behaviour of beforeProcess(). The escaped quotes in the
// error message are restored so e.what() is actually printed. The unused
// local string is removed.
int line_regex::Process (line_buffer & lb)
{
    int rslt, line = 0;
    LPTSTR lpBuffer;

    try {
        while ((lpBuffer = lb.getline ()) != NULL) {
            line++;

            rslt = beforeProcess (lpBuffer, line);
            if (rslt < 0)
                return rslt;
            if (rslt > 0)
                continue;       // filtered out by beforeProcess()

            rslt = ProcessLine (lpBuffer, line);
            if (rslt < 0)
                return rslt;

            rslt = afterProcess (lpBuffer, line);
            if (rslt < 0)
                return rslt;
        }
    }
    catch (const std::exception & e) {
        cout << "Error in expression: \"" << e.what () << "\""
             << endl;
    }
    return 0;
}
// Matches one line against the registered expressions and dispatches to the
// handler paired with the first expression that matches.
//
// The order in which expressions are tried is given by nextSeqence().
// Returns the handler's value when it is negative, -1 when no handler table
// has been installed, and 0 otherwise. noMatch() is called when nothing
// matched; its return value is ignored, as in the original.
//
// Fixes: "nothing matched" is tracked with a flag instead of `i ==
// maxSeqence`, which missed the case where an overridden nextSeqence() jumps
// past the end; a negative index from nextSeqence() ends the scan instead of
// indexing before the array; a missing handler table is reported instead of
// dereferenced; the debug comment that was wrapped onto two lines (leaving a
// stray `endl;`) is rejoined.
int line_regex::ProcessLine (LPTSTR strLine, int line)
{
    if (pfTable == NULL)
        return -1;

    bool matched = false;
    for (int i = 0; i >= 0 && i < maxSeqence; i = nextSeqence (i)) {
        if (!regex_search (strLine, what, expressions[i]))
            continue;

        matched = true;
        const int rslt = (this->*(pfTable[i])) (what, line);
        if (rslt < 0)
            return rslt;
#ifdef _DEBUG_OUT_
        // cout << line << ":match" << i << endl;
#endif //_DEBUG_OUT_
        break;      // only the first matching expression is handled
    }

    if (!matched) {
        noMatch (strLine, line);
#ifdef _DEBUG_OUT_
        // cout << line << ":no match" << endl;
#endif //_DEBUG_OUT_
    }
    return 0;
}
----------------------------------------------------
给个例子:
// Example parser set-up: registers the expression list and installs the
// handler table. Entry i of regex_exp is paired with entry i of
// pfTableStatic, so the two arrays must stay the same length and order.
PsParser::PsParser ()
{
// NULL-terminated list of patterns; the NULL entry marks the end for
// registe_expression().
// NOTE(review): several literals look damaged by the publishing pipeline —
// the `[...]()` sequences in the first, fourth and eighth patterns resemble
// markup artifacts, and the seventh pattern is split across two source lines
// in the middle of a string literal. Verify all patterns against the original
// source before relying on them.
static LPCTSTR regex_exp[] = {
_T ("^/RsvMt matrix currentmatrix def [\\[[0-9\]().]*[0-9\.]* [0-9\.]* [0-9\.]* ([0-9\.]*) ([0-9\.]*)\\]concat$"),
_T ("^.*-([0-9]*)X([0-9]*) setfont$"),
//_T ("^(.*) setfont$"),//
_T ("^([0-9]*) ([0-9]*) m$"),
_T ("^\\((.*)\\) [\\[(.*)\]()] 2 (?:fxs|fys|VT)$"),
_T ("^%%DownLoadCode (.*)$"),
_T ("^RsvMt setmatrix gr$"),
_T ("^([0-9\.]*) ([0-9\.]*) ([0-9\.]*) ([0-9\.]*)
Rect$"),
_T ("^\\((.*)\\) [\\S]().* file bOpenFile\\{ closefile $"),
_T ("^newpath ([0-9\.]*) ([0-9\.]*) moveto $"),
_T ("^ ([0-9\.]*) ([0-9\.]*) lineto$"),
NULL
};
// Handlers, in the same order as the patterns above.
// NOTE(review): the casts name the handlers without `&Class::`; standard C++
// requires that form for a pointer to member — confirm the compiler in use
// accepts this.
static ProcessFunction pfTableStatic[] = {
(ProcessFunction) text_start_block,
(ProcessFunction) text_setfont,
(ProcessFunction) text_movement,
(ProcessFunction) text_out,
(ProcessFunction) text_download_code,
(ProcessFunction) text_end_block,
(ProcessFunction) pic_text,
(ProcessFunction) pic_out,
(ProcessFunction) line_start,
(ProcessFunction) line_to
};
// Compile the patterns, then attach the handler table used for dispatch.
this->registe_expression ((LPCTSTR *) regex_exp);
pfTable = pfTableStatic;
return;
}
// Example driver: loads the file named by the first argument, runs it through
// PsParser and prints the processor time used, as reported by clock().
//
// Returns 0 on success and 1 when the argument is missing or the file cannot
// be loaded. The original used argv[1] without checking argc, ignored the
// result of open(), and its format string had lost the backslash of "\n".
int _tmain (int argc, _TCHAR * argv[])
{
    if (argc < 2) {
        printf ("usage: <program> <input file>\n");
        return 1;
    }

    line_buffer lb;
    PsParser p;

    if (lb.open (argv[1]) != 0) {
        printf ("cannot load input file\n");
        return 1;
    }

    p.Process (lb);
    lb.close ();

    // clock_t is not guaranteed to be int; print it through a known type.
    printf ("%ld\n", (long) clock ());
    return 0;
}