最近碰到这么个问题,一个文本,每行都是乱糟糟的东西,要从里面解析出东西来。行匹配铁定是用正则表达式,我用了Boost,不会的看我前两天的blog去。

下面是按行解析问题。简单来说,写一个类继承Lister,然后实现里面的三个纯虚方法。maxSeqence返回最多可以支持的表达式个数,registe_regex返回表达式文本,seqenceProcess返回相应处理函数的指针。其实也可以写成直接调用seqenceProcess并传入匹配序号,然后让用户在函数内部做switch-case。不过这样用户代码量稍微有点多,所以干脆玩一把技术。然后是几个非纯虚函数,nextSeqence可以根据当前状态来控制下一个要匹配的表达式,默认是+1,即一个一个全部匹配。beforeProcess和afterProcess分别在每行处理之前和之后调用,可以调整输入流。noMatch是一个比较常用的虚函数,用于响应没有任何表达式匹配时的状态。

匹配的结果在cmatch & what中,详细请看boost::regex的文档。其中what[0].str()可以以string形式获得整个匹配到的文本,从what[1]开始则依次是正则表达式中各个捕获组的匹配结果。

———2006-12-25—————

原来的结果删除,我重写了一个。

主要有两个问题,一个是getline的效率问题,我会撰文说明的。还有就是两处细节不大好。

为了修正这两个问题,我突然发现整个的构架不大好了——怎么办?重写吧——

下面是新的,一个类line_regex,直接继承就好。line_buffer是用于解决getline效率不高的问题的,当然我偷了个懒,实现代码用了WIN32API,所以是不可移植的。而且数据是一次读取,最多256M。不过相信这种级别的问题还难不倒大家。line_buffer类中的函数都很清晰明了,就不介绍了。

Process (tistream & is)和Process (line_buffer & lb)是两大入口,同时支持自有的输入方法和流输入。当然流输入清晰明了,标准化程度高,不过效率差得一塌糊涂。继承类初始化的时候,记得设置pfTable为入口列表,然后调用注册函数完成注册。nextSeqence和上面一样,可以定制下一个匹配式。noMatch在没有任何表达式匹配的时候调用。beforeProcess和afterProcess分别会在某行开始匹配之前和匹配结束之后调用,返回负数(例如-1)结束运行。其中beforeProcess返回正数会导致本行被跳过,可以作为过滤器。

---------------------LineRegex.h--------------------

#include <windows.h>
#include <tchar.h>
#include <stdio.h>
#include <time.h>
#include <iostream>
#include <new>
#include <string>
#include <boost/regex.hpp>
using namespace std;
using namespace boost;

typedef basic_stringtstring;
typedef basic_regextregex;
typedef match_resultstmatch;
typedef basic_istream >tistream;

#ifndef_LINE_REGEX_H_
#define_LINE_REGEX_H_

class line_regex;
typedef int (line_regex::*ProcessFunction) (const tmatch & what,
						int line);

class line_buffer {
public:
	line_buffer();
	~line_buffer();
	intopen(LPCTSTR lpPath);
	voidclose();
	LPTSTRgetline();
	longsize();
protected:
	UINTFileSize;
	LPVOIDlpFile;
	TCHAR *lpNow, *lpNext;
};

class line_regex {
public:
	line_regex();
	~line_regex();
	virtualintnextSeqence(int seqence);
	virtual intnoMatch(LPTSTR strLine, int line);
	virtual int beforeProcess (LPTSTR strLine, int line);
	virtual int afterProcess (LPTSTR strLine, int line);
	voidregiste_expression(LPCTSTRexps[]);
	int Process (tistream & is);
	int Process (line_buffer & lb);
protected:
	intProcessLine(LPTSTR strLine, int line);
	longmaxSeqence;
	ProcessFunction*pfTable;
	tregex*expressions;
	tmatchwhat;
};

#endif//_LINE_REGEX_H_

----------------------------------------------------

----------------------LineRegex.cpp-----------------

#include "stdafx.h"

// Creates an empty, closed buffer. Every member is given a defined value so
// that getline() or size() called before open() never reads an uninitialized
// pointer or size.
line_buffer::line_buffer ()
	: FileSize (0), lpFile (NULL), lpNow (NULL), lpNext (NULL)
{
}

// Destructor: hands the cleanup over to close(), which frees the buffer if
// one is still held.
line_buffer::~line_buffer ()
{
	this->close ();
}

// Loads the whole file at lpPath into a freshly allocated buffer.
//
// lpPath - path of an existing file; files larger than 0x10000000 bytes
//          (256 MB) are rejected.
// Returns 0 on success and -1 on any failure (NULL path, file cannot be
// opened, file too large, out of memory, read error). On failure the object
// is left closed.
int line_buffer::open (LPCTSTR lpPath)
{
	// Release a buffer left over from an earlier open() so it is not leaked.
	close ();
	FileSize = 0;
	lpNow = NULL;
	lpNext = NULL;

	if (lpPath == NULL)
		return -1;

	// OPEN_EXISTING instead of OPEN_ALWAYS: a reader must fail on a missing
	// file rather than silently create an empty one.
	HANDLE hFile = CreateFile (lpPath, GENERIC_READ, FILE_SHARE_READ, NULL,
				OPEN_EXISTING, 0, NULL);
	if (INVALID_HANDLE_VALUE == hFile)
		return -1;

	// The size limit also rejects INVALID_FILE_SIZE (0xFFFFFFFF), the error
	// value of GetFileSize.
	const DWORD dwSize = GetFileSize (hFile, NULL);
	if (dwSize > 0x10000000) {
		CloseHandle (hFile);
		return -1;
	}

	// One extra TCHAR of slack keeps a final line without a trailing newline
	// NUL-terminated and makes a one-character look-ahead at the end safe.
	const DWORD dwAlloc = dwSize + (DWORD) sizeof (TCHAR);
	BYTE *pData = new (std::nothrow) BYTE[dwAlloc];
	if (pData == NULL) {
		CloseHandle (hFile);
		return -1;
	}

	// ReadFile returns a BOOL: zero means failure (it is never negative).
	DWORD dwBytes = 0;
	const BOOL bRead = ReadFile (hFile, pData, dwSize, &dwBytes, NULL);
	CloseHandle (hFile);
	if (!bRead) {
		delete[] pData;
		return -1;
	}

	// Zero everything that was not filled by the read, including the slack.
	for (DWORD i = dwBytes; i < dwAlloc; i++)
		pData[i] = 0;

	lpFile = pData;
	FileSize = dwBytes;	// only the bytes actually read are valid data
	lpNext = (TCHAR *) pData;
	return 0;
}

void line_buffer::close ()
{
	if (lpFile != NULL)
		delete lpFile;
	lpFile = NULL;
	return;
}

// Returns the next line of the loaded file, or NULL when no line is left
// (also when nothing is loaded).
//
// The line terminator ("\n" or "\r\n") is overwritten with a NUL inside the
// buffer, so the returned pointer is a C string that stays valid until the
// buffer is released. An empty file and the position right after a trailing
// newline yield NULL rather than an extra empty line.
LPTSTR line_buffer::getline ()
{
	if (lpFile == NULL || lpNext == NULL)
		return NULL;

	// One past the last complete TCHAR of the loaded data.
	TCHAR *const lpEnd = (TCHAR *) lpFile + FileSize / sizeof (TCHAR);

	lpNow = lpNext;
	if (lpNow >= lpEnd) {
		// Everything has been consumed.
		lpNext = NULL;
		return NULL;
	}

	for (; lpNext < lpEnd; lpNext++) {
		if (*lpNext == _T ('\n')) {
			*lpNext = _T ('\0');
			lpNext++;
			return lpNow;
		}
		// Only look at the following character when it is inside the data.
		if ((*lpNext == _T ('\r')) && (lpNext + 1 < lpEnd)
			&& (*(lpNext + 1) == _T ('\n'))) {
			*lpNext = _T ('\0');
			lpNext += 2;
			return lpNow;
		}
	}

	// Final line without a trailing newline.
	// NOTE(review): this line is only NUL-terminated if the buffer holds a
	// zero TCHAR right after the file data -- confirm against open().
	lpNext = NULL;
	return lpNow;
}

long line_buffer::size ()
{
	return 0;
}

// Starts with no registered expressions and no handler table.
line_regex::line_regex ()
	: maxSeqence (0), pfTable (NULL), expressions (NULL)
{
}

// Frees the compiled expression array. The handler table is only forgotten,
// never deleted here.
line_regex::~line_regex ()
{
	delete[] expressions;	// delete[] on a NULL pointer is a no-op
	expressions = NULL;
	pfTable = NULL;
	maxSeqence = 0;
}

// Default matching order: after expression `seqence`, try the one that
// follows it. Override to customise the order.
int line_regex::nextSeqence (int seqence)
{
	const int following = seqence + 1;
	return following;
}

// Default hook for a line that no expression matched: does nothing and
// reports 0.
int line_regex::noMatch (LPTSTR strLine, int line)
{
	(void) strLine;	// unused in the default implementation
	(void) line;
	return 0;
}

// Default pre-match hook: does nothing and reports 0, so the line is
// processed normally.
int line_regex::beforeProcess (LPTSTR strLine, int line)
{
	(void) strLine;	// unused in the default implementation
	(void) line;
	return 0;
}

// Default post-match hook: does nothing and reports 0, so processing
// continues with the next line.
int line_regex::afterProcess (LPTSTR strLine, int line)
{
	(void) strLine;	// unused in the default implementation
	(void) line;
	return 0;
}

// Compiles a NULL-terminated array of expression strings and stores them as
// the active expression table, replacing any table registered before.
//
// exps - array of pattern strings ending with a NULL entry; a NULL array
//        simply clears the table.
// If a pattern fails to compile, the error text is written to cout and the
// table is left empty (maxSeqence == 0) instead of half initialised.
void line_regex::registe_expression (LPCTSTR exps[])
{
	// Discard the previous table first; delete[] on NULL is a no-op.
	delete[] expressions;
	expressions = NULL;
	maxSeqence = 0;

	if (exps == NULL)
		return;

	// Count the entries in front of the terminating NULL.
	long count = 0;
	while (exps[count] != NULL)
		count++;

	try {
		expressions = new tregex[count];
		for (long i = 0; i < count; i++)
			expressions[i].assign (exps[i]);
		// Publish the count only once every pattern compiled.
		maxSeqence = count;
	}
	catch (const std::exception & e) {
		cout << "Error in expression: \"" << e.what () << "\""
			 << endl;
		delete[] expressions;
		expressions = NULL;
	}
}

// Reads `is` line by line and runs every line through the hooks and the
// expression table.
//
// Per line: beforeProcess, then ProcessLine, then afterProcess. A negative
// result from any of them stops processing and is returned. A positive result
// from beforeProcess skips the line (filter behaviour). Line numbers passed
// to the hooks start at 1.
// Returns 0 when the whole stream was consumed, and -1 when an exception was
// caught (its message is written to cout).
int line_regex::Process (tistream & is)
{
	int line = 0;
	tstring str;
	try {
		while (getline (is, str)) {
			line++;
			// The hooks take a mutable LPTSTR; the string's own storage is
			// passed, as in the original code.
			LPTSTR lpLine = const_cast<LPTSTR> (str.c_str ());

			int rslt = beforeProcess (lpLine, line);
			if (rslt < 0)
				return rslt;
			if (rslt > 0)	// positive: the hook asks to skip this line
				continue;
			rslt = ProcessLine (lpLine, line);
			if (rslt < 0)
				return rslt;
			rslt = afterProcess (lpLine, line);
			if (rslt < 0)
				return rslt;
		}
	}
	catch (const std::exception & e) {
		cout << "Error in expression: \"" << e.what () << "\""
			 << endl;
		return -1;
	}
	return 0;
}

// Pulls lines from `lb` until it returns NULL and runs every line through the
// hooks and the expression table.
//
// Per line: beforeProcess, then ProcessLine, then afterProcess. A negative
// result from any of them stops processing and is returned. A positive result
// from beforeProcess skips the line (filter behaviour). Line numbers passed
// to the hooks start at 1.
// Returns 0 when all lines were consumed, and -1 when an exception was caught
// (its message is written to cout).
int line_regex::Process (line_buffer & lb)
{
	int line = 0;
	try {
		for (LPTSTR lpBuffer = lb.getline (); lpBuffer != NULL;
			 lpBuffer = lb.getline ()) {
			line++;

			int rslt = beforeProcess (lpBuffer, line);
			if (rslt < 0)
				return rslt;
			if (rslt > 0)	// positive: the hook asks to skip this line
				continue;
			rslt = ProcessLine (lpBuffer, line);
			if (rslt < 0)
				return rslt;
			rslt = afterProcess (lpBuffer, line);
			if (rslt < 0)
				return rslt;
		}
	}
	catch (const std::exception & e) {
		cout << "Error in expression: \"" << e.what () << "\""
			 << endl;
		return -1;
	}
	return 0;
}

// Matches one line against the registered expressions and dispatches to the
// handler of the first one that matches.
//
// strLine - NUL-terminated line text
// line    - line number, forwarded to the handler / noMatch
// Expressions are tried starting at index 0 in the order given by
// nextSeqence(). The walk ends as soon as the index leaves [0, maxSeqence),
// so an override that skips past the end or returns a negative index cannot
// index outside the table. If nothing matched, noMatch() is called and its
// result is ignored, as before.
// Returns the handler's result when it is negative, otherwise 0.
int line_regex::ProcessLine (LPTSTR strLine, int line)
{
	bool matched = false;
	for (int i = 0; i >= 0 && i < maxSeqence; i = nextSeqence (i)) {
		if (!regex_search (strLine, what, expressions[i]))
			continue;
		matched = true;
		// A missing table or an empty slot means "matched, nothing to do".
		if (pfTable != NULL && pfTable[i] != NULL) {
			const int rslt = (this->*(pfTable[i])) (what, line);
			if (rslt < 0)
				return rslt;
		}
#ifdef _DEBUG_OUT_
		// cout << line << ":match" << i << endl;
#endif //_DEBUG_OUT_
		break;
	}
	if (!matched) {
		noMatch (strLine, line);
#ifdef _DEBUG_OUT_
		// cout << line << ":no match" << endl;
#endif //_DEBUG_OUT_
	}
	return 0;
}

----------------------------------------------------

给个例子:

// Example parser set-up: registers the expression list and installs the
// handler table. Entry i of pfTableStatic handles a match of entry i of
// regex_exp, so the two tables must stay in the same order and length.
// NOTE(review): several patterns below look damaged by the publishing process
// (odd bracket sequences such as "[\\[[0-9\]()" and "[\\S]()"); they are kept
// exactly as published -- verify them against real input before use.
PsParser::PsParser ()
{
	static LPCTSTR regex_exp[] = {
		_T ("^/RsvMt matrix currentmatrix def [\\[[0-9\]().]*[0-9\.]* [0-9\.]* [0-9\.]* ([0-9\.]*) ([0-9\.]*)\\]concat$"),
		_T ("^.*-([0-9]*)X([0-9]*) setfont$"),
//_T ("^(.*) setfont$"),//
		_T ("^([0-9]*) ([0-9]*) m$"),
		_T ("^\\((.*)\\) [\\[(.*)\]()] 2 (?:fxs|fys|VT)$"),
		_T ("^%%DownLoadCode (.*)$"),
		_T ("^RsvMt setmatrix gr$"),
		// This literal was split across two lines in the listing, which
		// does not compile; it is re-joined at the wrap point with a space.
		_T ("^([0-9\.]*) ([0-9\.]*) ([0-9\.]*) ([0-9\.]*) Rect$"),
		_T ("^\\((.*)\\) [\\S]().* file bOpenFile\\{ closefile $"),
		_T ("^newpath ([0-9\.]*) ([0-9\.]*) moveto $"),
		_T ("^ ([0-9\.]*) ([0-9\.]*) lineto$"),
		NULL
	};

	// A pointer to member must be formed with &Class::member; the bare
	// member name is not valid standard C++.
	static ProcessFunction pfTableStatic[] = {
		(ProcessFunction) &PsParser::text_start_block,
		(ProcessFunction) &PsParser::text_setfont,
		(ProcessFunction) &PsParser::text_movement,
		(ProcessFunction) &PsParser::text_out,
		(ProcessFunction) &PsParser::text_download_code,
		(ProcessFunction) &PsParser::text_end_block,
		(ProcessFunction) &PsParser::pic_text,
		(ProcessFunction) &PsParser::pic_out,
		(ProcessFunction) &PsParser::line_start,
		(ProcessFunction) &PsParser::line_to
	};

	this->registe_expression (regex_exp);
	pfTable = pfTableStatic;
}

// Example driver: loads the file named by the first command-line argument,
// runs it through PsParser and prints the value of clock() afterwards.
// Returns 1 when the argument is missing or the file cannot be loaded,
// otherwise 0.
int _tmain (int argc, _TCHAR * argv[])
{
	if (argc < 2) {
		printf ("missing input file argument\n");
		return 1;
	}

	line_buffer lb;
	PsParser p;
	if (lb.open (argv[1]) != 0) {
		printf ("cannot open input file\n");
		return 1;
	}
	p.Process (lb);
	lb.close ();
	// clock_t is not guaranteed to be int; convert explicitly for printf.
	printf ("%ld\n", static_cast<long> (clock ()));
	return 0;
}