libxml2[1]是GNOME项目开发的一个XML库,同时支持SAX和DOM两种接口。

在解析xml的时候,libxml2会将xml中不属于标签的部分作为text节点插入。如果是属性,则添加一个属性节点到父节点上,再添加一个文字节点到属性节点下。这样整个xml就变成了一棵单纯的树。

支持多语言的问题上,libxml2的内核只支持UTF-8。但是可以通过注册编码句柄来添加语言支持,一般是配合iconv[2]使用的,因为libxml2的编译依赖就是iconv。下面是代码。

#include <errno.h>

/*
 * Shared iconv conversion descriptors, named <to>_<from>.
 * Both are opened in _tmain() with iconv_open (tocode, fromcode).
 */
iconv_t iconv_utf8_gbk;	/* GBK -> UTF-8; used by gbk_input () */
iconv_t iconv_gbk_utf8;	/* UTF-8 -> GBK; used by gbk_output () */

/*
 * libxml2 input-side encoding handler: transcode GBK bytes to UTF-8 using
 * the global descriptor iconv_utf8_gbk.
 *
 * out     destination buffer for the UTF-8 bytes
 * outlen  in: capacity of out in bytes; out: number of bytes produced
 * in      source buffer holding GBK bytes
 * inlen   in: number of bytes available in in; out: number of bytes consumed
 *
 * Returns the number of bytes written to out, -1 if there is no room to
 * make any progress (or the arguments are unusable), or -2 if the input
 * cannot be transcoded.
 */
int gbk_input (unsigned char *out, int *outlen, const unsigned char *in,
	       int *inlen)
{
	char *outbuf = (char *) out;
	/* iconv never writes through the input pointer; POSIX just declares
	   it as char **. */
	char *inbuf = (char *) in;
	size_t in_left;
	size_t out_left;
	size_t rslt;
	int saved_errno;

	if (out == NULL || outlen == NULL || inlen == NULL)
		return -1;
	if (in == NULL) {
		/* Nothing to convert: report zero progress. */
		*outlen = 0;
		*inlen = 0;
		return 0;
	}
	if (*inlen < 0 || *outlen < 0)
		return -1;

	/*
	 * iconv works on size_t counters.  Handing it the int objects through
	 * a cast would make it read and write sizeof (size_t) bytes on top of
	 * an int, so use real size_t locals instead.
	 */
	in_left = (size_t) *inlen;
	out_left = (size_t) *outlen;

	rslt = iconv (iconv_utf8_gbk, &inbuf, &in_left, &outbuf, &out_left);
	saved_errno = errno;

	/* iconv advanced both cursors; report progress even on failure. */
	*outlen = (int) ((unsigned char *) outbuf - out);
	*inlen = (int) ((const unsigned char *) inbuf - in);

	/* iconv signals failure with (size_t) -1; size_t is never "< 0". */
	if (rslt == (size_t) -1) {
		if (saved_errno == E2BIG) {
			/* Output full: only an error if nothing fit at all. */
			if (*outlen == 0)
				return -1;
		} else if (saved_errno != EINVAL) {
			/* EILSEQ or an unusable descriptor. */
			return -2;
		}
		/* EINVAL: input ends inside a multibyte character; the
		   unconsumed tail is left for the next call. */
	}
	return *outlen;
}

/*
 * libxml2 output-side encoding handler: transcode UTF-8 bytes to GBK using
 * the global descriptor iconv_gbk_utf8.
 *
 * out     destination buffer for the GBK bytes
 * outlen  in: capacity of out in bytes; out: number of bytes produced
 * in      source buffer holding UTF-8 bytes
 * inlen   in: number of bytes available in in; out: number of bytes consumed
 *
 * Returns the number of bytes written to out, -1 if there is no room to
 * make any progress (or the arguments are unusable), or -2 if the input
 * cannot be transcoded.
 */
int gbk_output (unsigned char *out,
		int *outlen, const unsigned char *in, int *inlen)
{
	char *outbuf = (char *) out;
	/* iconv never writes through the input pointer; POSIX just declares
	   it as char **. */
	char *inbuf = (char *) in;
	size_t in_left;
	size_t out_left;
	size_t rslt;
	int saved_errno;

	if (out == NULL || outlen == NULL || inlen == NULL)
		return -1;
	if (in == NULL) {
		/* Nothing to convert: report zero progress. */
		*outlen = 0;
		*inlen = 0;
		return 0;
	}
	if (*inlen < 0 || *outlen < 0)
		return -1;

	/*
	 * iconv works on size_t counters.  Handing it the int objects through
	 * a cast would make it read and write sizeof (size_t) bytes on top of
	 * an int, so use real size_t locals instead.
	 */
	in_left = (size_t) *inlen;
	out_left = (size_t) *outlen;

	rslt = iconv (iconv_gbk_utf8, &inbuf, &in_left, &outbuf, &out_left);
	saved_errno = errno;

	/* iconv advanced both cursors; report progress even on failure. */
	*outlen = (int) ((unsigned char *) outbuf - out);
	*inlen = (int) ((const unsigned char *) inbuf - in);

	/* iconv signals failure with (size_t) -1; size_t is never "< 0". */
	if (rslt == (size_t) -1) {
		if (saved_errno == E2BIG) {
			/* Output full: only an error if nothing fit at all. */
			if (*outlen == 0)
				return -1;
		} else if (saved_errno != EINVAL) {
			/* EILSEQ or an unusable descriptor. */
			return -2;
		}
		/* EINVAL: input ends inside a multibyte character; the
		   unconsumed tail is left for the next call. */
	}
	return *outlen;
}

/*
 * Print a sibling list of nodes, and recursively the children of every
 * element node, to stdout.
 *
 * doc     owning document; only forwarded to the recursive call
 * a_node  first node of the sibling list to print (may be NULL)
 * n       nesting depth; each node line is prefixed with n spaces
 *
 * Each node is printed as "node <type>: <name> = <content>", followed by
 * one "attr <name> = <value>" line per attribute.  Only the content of the
 * attribute's first child node is shown as its value.
 */
static void print_element_names (xmlDocPtr doc, xmlNode * a_node,
				 int n)
{
	xmlNode *cur_node = NULL;
	xmlAttr *cur_attr = NULL;

	for (cur_node = a_node; cur_node; cur_node = cur_node->next)
	{
		for (int i = 0; i < n; ++i)
			printf (" ");

		/* name/content may be NULL (e.g. element nodes carry no
		   content); never hand a NULL pointer to %s. */
		printf ("node %d: %s = %s\n", (int) cur_node->type,
			cur_node->name
			? (const char *) cur_node->name : "(null)",
			cur_node->content
			? (const char *) cur_node->content : "(null)");

		for (cur_attr = cur_node->properties; cur_attr;
		     cur_attr = cur_attr->next)
		{
			/* An empty attribute (a="") has no child node. */
			const xmlNode *value = cur_attr->children;
			const char *text = "";

			if (value != NULL)
				text = value->content
					? (const char *) value->content
					: "(null)";
			printf ("attr %s = %s\n",
				cur_attr->name
				? (const char *) cur_attr->name : "(null)",
				text);
		}

		if (cur_node->type == XML_ELEMENT_NODE)
			print_element_names (doc, cur_node->children, n + 1);
	}
}

/*
 * Demo entry point: register GBK/GB2312 encoding handlers backed by iconv,
 * parse "Q.xml" and dump its node tree to stdout.
 *
 * argc, argv are unused.
 *
 * Returns 0 on success, 1 if the iconv converters cannot be opened or the
 * file cannot be parsed.
 */
int _tmain (int argc, _TCHAR * argv[])
{
	const char *xml_path = "Q.xml";
	xmlDoc *doc = NULL;
	xmlNode *root_element = NULL;
	int status = 1;

	(void) argc;
	(void) argv;

	LIBXML_TEST_VERSION;

	/* iconv_open takes (tocode, fromcode) and returns (iconv_t) -1 on
	   failure. */
	iconv_utf8_gbk = iconv_open ("utf-8", "gbk");
	iconv_gbk_utf8 = iconv_open ("gbk", "utf-8");
	if (iconv_utf8_gbk == (iconv_t) -1 || iconv_gbk_utf8 == (iconv_t) -1) {
		printf ("Failed to open iconv converters\n");
		goto out;
	}

	/* Add gb2312 encoding support. */
	xmlNewCharEncodingHandler ("gb2312", gbk_input, gbk_output);
	/* Add gbk encoding support. */
	xmlNewCharEncodingHandler ("gbk", gbk_input, gbk_output);

	doc = xmlReadFile (xml_path, NULL, 0);
	if (doc == NULL) {
		printf ("Failed to parse %s\n", xml_path);
		goto out;
	}

	root_element = xmlDocGetRootElement (doc);
	print_element_names (doc, root_element, 0);
	status = 0;

out:
	/* Single exit path so the parser state and both iconv descriptors
	   are released on failure as well as on success. */
	if (doc != NULL)
		xmlFreeDoc (doc);
	xmlCleanupParser ();
	xmlMemoryDump ();
	if (iconv_gbk_utf8 != (iconv_t) -1)
		iconv_close (iconv_gbk_utf8);
	if (iconv_utf8_gbk != (iconv_t) -1)
		iconv_close (iconv_utf8_gbk);
	return status;
}

在XML无指定编码的时候没有测试过,不过按照XML标准,这时候应该是UTF-8编码。不符合标准的话……自己想办法吧。

Reference:

  1. The XML C parser and toolkit of Gnome: http://xmlsoft.org/

  2. libiconv: http://www.gnu.org/savannah-checkouts/gnu/libiconv/