NewStringUTF 出现 "input is not valid Modified UTF-8" 错误的解决办法

在使用 JNI 的 JNIEnv->NewStringUTF 时抛出了异常 "JNI DETECTED ERROR IN APPLICATION: input is not valid Modified UTF-8: illegal start byte 0xfe"。网上搜索了一下,这个异常是由 Java 虚拟机内部 dalvik/vm/CheckJni.c 中的 checkUtfString 函数抛出的,而且 JVM 的这个接口明确不支持四个字节的 UTF-8 字符。所以需要在调用该函数之前,对传入的字符串进行过滤,过滤函数如下:

/*
 * Checks whether a NUL-terminated byte string has the lead-byte /
 * continuation-byte structure of Modified UTF-8 (sequences of one, two or
 * three bytes only).
 *
 * Only the structure is checked: a lead byte must be followed by the right
 * number of 10xxxxxx continuation bytes. Four-byte sequences (lead byte
 * 1111xxxx) and stray continuation bytes (10xxxxxx) are rejected.
 *
 * bytes: the string to inspect; may be NULL.
 *
 * Returns 0 when every sequence is well formed, -1 when bytes is NULL or
 * an illegal start or continuation byte is found.
 */
int checkUtfString(const char* bytes)
{
    if (bytes == NULL) {
        return -1;
    }
    while (*bytes != '\0') {
        unsigned char utf8 = (unsigned char)*(bytes++);
        // Switch on the high four bits of the lead byte.
        switch (utf8 >> 4) {
            case 0x00:
            case 0x01:
            case 0x02:
            case 0x03:
            case 0x04:
            case 0x05:
            case 0x06:
            case 0x07:
                // Bit pattern 0xxx. No need for any extra bytes.
                break;
            case 0x08:
            case 0x09:
            case 0x0a:
            case 0x0b:
            case 0x0f:
                // Bit pattern 10xx or 1111: illegal start byte.
                return -1;
            case 0x0e:
                // Bit pattern 1110, so there are two additional bytes.
                // A terminating NUL fails this test too, so a truncated
                // sequence is rejected without reading past the string end.
                utf8 = (unsigned char)*(bytes++);
                if ((utf8 & 0xc0) != 0x80) {
                    return -1;
                }
                // Take care of the final byte below.
                /* fallthrough */
            case 0x0c:
            case 0x0d:
                // Bit pattern 110x, so there is one additional byte.
                utf8 = (unsigned char)*(bytes++);
                if ((utf8 & 0xc0) != 0x80) {
                    return -1;
                }
                break;
            default:
                // Not reachable: all sixteen values of (utf8 >> 4) are
                // handled above.
                break;
        }
    }
    return 0;
}

把这个函数改了一下,当作一个修正函数:它不再返回错误,而是把非法字节原地替换为 '?'。下面是代码:

/*
 * Repairs a NUL-terminated byte string in place so that it contains only
 * one-, two- and three-byte sequences with well-formed continuation bytes.
 *
 * Every illegal start byte (bit pattern 10xx or 1111) is overwritten with
 * '?'. When a two- or three-byte sequence has a bad continuation byte, its
 * lead byte is overwritten with '?' and scanning resumes at the byte right
 * after that lead byte, so the remaining bytes are re-examined on their own.
 * Bytes are only overwritten, never inserted or removed, so the length of
 * the string does not change.
 *
 * bytes: writable string to repair; a null pointer is accepted and ignored.
 */
void correctUtfBytes(char* bytes) {
    if (!bytes) {
        return;
    }
    while (*bytes != '\0') {
        char* lead = bytes;  // start of the sequence being examined
        unsigned char utf8 = (unsigned char)*(bytes++);
        // Switch on the high four bits of the lead byte.
        switch (utf8 >> 4) {
        case 0x00:
        case 0x01:
        case 0x02:
        case 0x03:
        case 0x04:
        case 0x05:
        case 0x06:
        case 0x07:
            // Bit pattern 0xxx. No need for any extra bytes.
            break;
        case 0x08:
        case 0x09:
        case 0x0a:
        case 0x0b:
        case 0x0f:
            /*
             * Bit pattern 10xx or 1111, which are illegal start bytes.
             * Note: 1111 is valid for normal UTF-8, but not the
             * modified UTF-8 used here.
             */
            *lead = '?';
            break;
        case 0x0e:
            // Bit pattern 1110, so there are two additional bytes.
            utf8 = (unsigned char)*bytes;
            if ((utf8 & 0xc0) != 0x80) {
                // bytes already points just past the lead byte, so the
                // offending byte (possibly the NUL) is examined next.
                *lead = '?';
                break;
            }
            ++bytes;
            // Take care of the final byte below.
            /* fallthrough */
        case 0x0c:
        case 0x0d:
            // Bit pattern 110x, so there is one additional byte.
            utf8 = (unsigned char)*bytes;
            if ((utf8 & 0xc0) != 0x80) {
                *lead = '?';
                // Rewind so any already-accepted continuation byte is
                // re-examined as a start byte of its own.
                bytes = lead + 1;
            } else {
                ++bytes;
            }
            break;
        default:
            // Not reachable: all sixteen values of (utf8 >> 4) are
            // handled above.
            break;
        }
    }
}