libxml2是GNOME项目的XML解析库,具有强大的解析能力,支持DOM和SAX两种解析模型,属于验证型解析器。其内部统一使用utf-8编码工作,因此gbk之类编码的XML默认无法直接解析。为了解决这个问题,我们可以使用一个很简单的小窍门。

libxml是要和iconv一并使用的,头文件引用一般类似以下形式。

#include <errno.h>
#include <iconv.h>
#pragma comment(lib, "iconv")
#include <libxml/tree.h>
#include <libxml/parser.h>
#pragma comment(lib, "libxml2")

为此,我们向libxml注册自定义的编码处理句柄:输入时先把gbk编码的数据转换为utf-8再交给解析器,输出时再把utf-8转换回gbk。

/*
 * Shared iconv descriptors used by the handlers below.  They stay NULL
 * until the initialisation code opens them with iconv_open().
 *
 *   iconv_utf8_gbk: opened as iconv_open ("utf-8", "gbk"), i.e. it
 *                   converts GBK bytes INTO UTF-8 (used by gbk_input).
 *   iconv_gbk_utf8: opened as iconv_open ("gbk", "utf-8"), i.e. it
 *                   converts UTF-8 bytes INTO GBK (used by gbk_output).
 */
iconv_t iconv_utf8_gbk = NULL;
iconv_t iconv_gbk_utf8 = NULL;

/*
 * Run one iconv() pass over a chunk on behalf of a libxml2 encoding
 * handler.
 *
 * cd      conversion descriptor to use
 * out     destination buffer
 * outlen  in: capacity of `out` in bytes; out: number of bytes written
 * in      source buffer
 * inlen   in: number of source bytes; out: number of bytes consumed
 *
 * Returns the number of bytes written on success (also when the chunk
 * ends in the middle of a multi-byte character, which is normal at chunk
 * boundaries), -1 for bad arguments or when the output buffer is too
 * small, and -2 when the descriptor is unusable or the input cannot be
 * transcoded.  *outlen and *inlen always reflect the real progress made.
 */
static int gbk_iconv_convert (iconv_t cd, unsigned char *out, int *outlen,
			      const unsigned char *in, int *inlen)
{
	char *outbuf = (char *) out;
	char *inbuf = (char *) in;
	size_t inleft;
	size_t outleft;
	size_t rslt;
	int saved_errno;

	if (out == NULL || outlen == NULL || inlen == NULL)
		return -1;
	if (in == NULL) {
		*outlen = 0;
		*inlen = 0;
		return 0;
	}
	/* Descriptor never opened / failed to open, or nonsensical sizes. */
	if (cd == NULL || cd == (iconv_t) -1 || *inlen < 0 || *outlen < 0) {
		*outlen = 0;
		*inlen = 0;
		return -2;
	}

	/*
	 * iconv() works on size_t counters.  Copy the int lengths into real
	 * size_t variables instead of casting the int pointers: size_t is
	 * wider than int on LP64 targets, so the cast would make iconv()
	 * read and write past the caller's ints.
	 */
	inleft = (size_t) *inlen;
	outleft = (size_t) *outlen;

	errno = 0;
	/*
	 * The (void *) cast lets this compile in C against both the POSIX
	 * prototype (char **) and older libiconv builds (const char **).
	 */
	rslt = iconv (cd, (void *) &inbuf, &inleft, &outbuf, &outleft);
	saved_errno = errno;

	/* Report progress from how far iconv() advanced the pointers. */
	*outlen = (int) ((unsigned char *) outbuf - out);
	*inlen = (int) ((unsigned char *) inbuf - in);

	/* iconv() signals failure with (size_t) -1, never a negative value. */
	if (rslt == (size_t) -1) {
		if (saved_errno == E2BIG)
			return -1;	/* output buffer exhausted */
		if (saved_errno != EINVAL)
			return -2;	/* invalid sequence or bad descriptor */
		/* EINVAL: truncated trailing character; keep what was done. */
	}
	return *outlen;
}

/*
 * libxml2 input handler: converts a chunk of GBK/GB2312 text in `in`
 * to UTF-8 in `out`.  See gbk_iconv_convert() for the length and
 * return-value conventions.
 */
int gbk_input (unsigned char *out, int *outlen, const unsigned char
		   *in,
		   int *inlen)
{
	return gbk_iconv_convert (iconv_utf8_gbk, out, outlen, in, inlen);
}

/*
 * libxml2 output handler: converts a chunk of UTF-8 text in `in` to
 * GBK in `out`.  See gbk_iconv_convert() for the length and
 * return-value conventions.
 */
int gbk_output (unsigned char *out,
		int *outlen, const unsigned char *in, int *inlen)
{
	return gbk_iconv_convert (iconv_gbk_utf8, out, outlen, in, inlen);
}

初始化的时候运行以下代码进行句柄注册。

{
	/*
	 * One-time setup: open the two iconv descriptors and register the
	 * GBK/GB2312 handlers with libxml2.
	 *
	 * iconv_open() reports failure with (iconv_t) -1, not NULL.  Reset a
	 * failed descriptor to NULL so that a later run of this block can
	 * retry, and so that it is never mistaken for a usable descriptor.
	 */
	if (iconv_utf8_gbk == NULL) {
		iconv_utf8_gbk = iconv_open ("utf-8", "gbk");
		if (iconv_utf8_gbk == (iconv_t) -1)
			iconv_utf8_gbk = NULL;
	}
	if (iconv_gbk_utf8 == NULL) {
		iconv_gbk_utf8 = iconv_open ("gbk", "utf-8");
		if (iconv_gbk_utf8 == (iconv_t) -1)
			iconv_gbk_utf8 = NULL;
	}
	LIBXML_TEST_VERSION;
	/*
	 * Register the handlers only when both conversions are available;
	 * otherwise they would call iconv() with an invalid descriptor.
	 * NOTE(review): every run of this block registers the handlers
	 * again - run it once per process.
	 */
	if (iconv_utf8_gbk != NULL && iconv_gbk_utf8 != NULL) {
		xmlNewCharEncodingHandler ("gb2312", gbk_input, gbk_output);
		xmlNewCharEncodingHandler ("gbk", gbk_input, gbk_output);
	}
}

经过试验,这样我们就可以解析编码类型为gbk和gb2312的xml文件了。可以进行输入和输出,不过输出的utf-8形式让人觉得有点难过……