Shell's Home

Sep 28, 2007 - 2 minute read - Comments

libxml2入门和中文支持

libxml2是gnome做的一个xml的库,支持SAX和DOM。

在解析xml的时候,libxml2会将xml中不属于标签的部分作为text节点插入。如果是属性,则添加一个属性节点到父节点上,一个文字节点到属性节点下。那么整个xml就变成了一棵单纯的树。

支持多语言的问题上,libxml2的内核只支持UTF-8。但是可以通过注册编码句柄来添加语言支持,一般是配合iconv[2]使用的,因为libxml2的编译依赖就是iconv。下面是代码。

#include <errno.h>  /* errno, E2BIG, EINVAL: used to classify iconv() failures */

/*
 * Conversion descriptors shared by the two libxml2 encoding callbacks below.
 * The names read "to_from": main opens iconv_utf8_gbk with
 * iconv_open ("utf-8", "gbk"), i.e. it converts GBK *to* UTF-8, and
 * iconv_gbk_utf8 converts UTF-8 *to* GBK.
 */
iconv_t iconv_utf8_gbk;
iconv_t iconv_gbk_utf8;

/*
 * Run one iconv() step using the calling convention of libxml2 encoding
 * handlers.
 *
 * cd      conversion descriptor to use
 * out     destination buffer
 * outlen  in: capacity of `out` in bytes; out: bytes actually written
 * in      source buffer (may be NULL; iconv() then only resets its state)
 * inlen   in: bytes available in `in`; out: bytes actually consumed
 *
 * Returns the number of bytes written on success or partial progress,
 * -1 for bad arguments or when `out` has no room for even one character,
 * -2 when the input cannot be transcoded.
 */
static int gbk_iconv_convert (iconv_t cd, unsigned char *out, int *outlen,
                              const unsigned char *in, int *inlen)
{
    char *outbuf = (char *) out;
    char *inbuf = (char *) in;
    size_t in_total;
    size_t out_total;
    size_t in_left;
    size_t out_left;
    size_t rslt;
    int err;

    if (out == NULL || outlen == NULL || inlen == NULL
        || *outlen < 0 || *inlen < 0)
    {
        return -1;
    }

    /*
     * iconv() works on size_t counters. Casting the int pointers to
     * size_t * (as the old code did) makes iconv() read and write 8 bytes
     * through a pointer to a 4-byte int on LP64 platforms, so use real
     * size_t locals and copy the results back.
     */
    in_total = (size_t) *inlen;
    out_total = (size_t) *outlen;
    in_left = in_total;
    out_left = out_total;

    errno = 0;
    /*
     * The second parameter is `char **` in POSIX but `const char **` in some
     * older libiconv builds; in C a void * converts to either one.
     */
    rslt = iconv (cd, (void *) &inbuf, &in_left, &outbuf, &out_left);
    err = errno;

    /* Report progress even on failure: iconv() keeps what it converted. */
    *inlen = (int) (in_total - in_left);
    *outlen = (int) (out_total - out_left);

    /* iconv() returns size_t, so failure is (size_t) -1, never "< 0". */
    if (rslt == (size_t) -1)
    {
        if (err == E2BIG)
        {
            /* Output full: only an error if nothing at all was consumed. */
            if (*inlen == 0)
            {
                return -1;
            }
        }
        else if (err != EINVAL)
        {
            /* EILSEQ (invalid sequence) or an unusable descriptor. */
            return -2;
        }
        /*
         * EINVAL: the input ends in the middle of a multibyte character.
         * The truncated tail is left unconsumed for the next call.
         */
    }
    return *outlen;
}

/*
 * libxml2 input callback: converts GBK bytes from `in` into UTF-8 in `out`.
 * See gbk_iconv_convert() for the meaning of the lengths and return value.
 */
int gbk_input (unsigned char *out, int *outlen, const unsigned char
           *in,
           int *inlen)
{
    return gbk_iconv_convert (iconv_utf8_gbk, out, outlen, in, inlen);
}

/*
 * libxml2 output callback: converts UTF-8 bytes from `in` into GBK in `out`.
 * See gbk_iconv_convert() for the meaning of the lengths and return value.
 */
int gbk_output (unsigned char *out,
        int *outlen, const unsigned char *in, int *inlen)
{
    return gbk_iconv_convert (iconv_gbk_utf8, out, outlen, in, inlen);
}

/*
 * Print a node list and, recursively, the children of every element node.
 *
 * doc     owning document; currently unused, kept so callers need not change
 * a_node  first node of the sibling list to print (NULL prints nothing)
 * n       nesting depth; each line is indented by n spaces
 *
 * Each node is printed as "node <type>: <name> = <content>", followed by
 * one "attr <name> = <value>" line per attribute.
 */
static void print_element_names (xmlDocPtr doc, xmlNode * a_node,
                 int n)
{
    xmlNode *cur_node = NULL;
    xmlAttr *cur_attr = NULL;

    (void) doc;

    for (cur_node = a_node; cur_node; cur_node = cur_node->next)
    {
        for (int i = 0; i < n; ++i)
        {
            printf (" ");
        }

        /* name/content may be NULL; passing NULL to %s is undefined. */
        printf ("node %d: %s = %s\n", (int) cur_node->type,
            cur_node->name ? (const char *) cur_node->name : "(null)",
            cur_node->content ? (const char *) cur_node->content : "(null)");

        for (cur_attr = cur_node->properties; cur_attr;
             cur_attr = cur_attr->next)
        {
            const char *attr_name = "(null)";
            const char *attr_value = "";

            if (cur_attr->name != NULL)
            {
                attr_name = (const char *) cur_attr->name;
            }
            /* An attribute may have no child node; do not dereference it. */
            if (cur_attr->children != NULL
                && cur_attr->children->content != NULL)
            {
                attr_value = (const char *) cur_attr->children->content;
            }
            printf ("attr %s = %s\n", attr_name, attr_value);
        }

        /* Only element nodes are descended into. */
        if (cur_node->type == XML_ELEMENT_NODE)
        {
            print_element_names (doc, cur_node->children, n + 1);
        }
    }
}

/*
 * Demo entry point: registers GBK/GB2312 encoding handlers with libxml2,
 * parses "Q.xml" and prints its node tree.
 *
 * Returns 0 on success and 1 if the iconv descriptors cannot be opened,
 * the handlers cannot be registered, or the file cannot be parsed.
 * The command-line arguments are not used.
 */
int _tmain (int argc, _TCHAR * argv[])
{
    const char *path = "Q.xml";
    xmlDoc *doc = NULL;
    xmlNode *root_element = NULL;
    int rc = 1;

    (void) argc;
    (void) argv;

    LIBXML_TEST_VERSION;

    /* iconv_open (tocode, fromcode): "utf8_gbk" means to UTF-8 from GBK. */
    iconv_utf8_gbk = iconv_open ("utf-8", "gbk");
    iconv_gbk_utf8 = iconv_open ("gbk", "utf-8");
    if (iconv_utf8_gbk == (iconv_t) -1 || iconv_gbk_utf8 == (iconv_t) -1)
    {
        printf ("Failed to open the GBK <-> UTF-8 converters\n");
        goto cleanup;
    }

    /* Add gb2312 and gbk encoding support; both use the GBK converters. */
    if (xmlNewCharEncodingHandler ("gb2312", gbk_input, gbk_output) == NULL
        || xmlNewCharEncodingHandler ("gbk", gbk_input, gbk_output) == NULL)
    {
        printf ("Failed to register the GBK encoding handlers\n");
        goto cleanup;
    }

    doc = xmlReadFile (path, NULL, 0);
    if (doc == NULL)
    {
        printf ("Failed to parse %s\n", path);
        goto cleanup;
    }

    root_element = xmlDocGetRootElement (doc);
    print_element_names (doc, root_element, 0);
    rc = 0;

cleanup:
    /* Single exit path so the failure cases release resources too. */
    if (doc != NULL)
    {
        xmlFreeDoc (doc);
    }
    xmlCleanupParser ();
    xmlMemoryDump ();
    if (iconv_gbk_utf8 != (iconv_t) -1)
    {
        iconv_close (iconv_gbk_utf8);
    }
    if (iconv_utf8_gbk != (iconv_t) -1)
    {
        iconv_close (iconv_utf8_gbk);
    }
    return rc;
}

在XML无指定编码的时候没有测试过,不过按照XML标准,这时候应该是UTF-8编码。不符合标准的话……自己想办法吧。

Reference:

  1. The XML C parser and toolkit of Gnome: http://xmlsoft.org/

  2. libiconv: http://www.gnu.org/savannah-checkouts/gnu/libiconv/

Tags: c codec program

wget介绍 骑马

comments powered by Disqus