CHttpFile读取GBK网页时乱码
在VS2008下,项目字符集是Unicode,CHttpFile读取GBK网页时乱码. 我查看字符串是用的断点,在运行时看字符串值的。
读取gb2312网页strHtml直接就是中文。读取UTF-8网页时,用ConvertUTF8toGB2312可以转成中文。
读取gbk,如sohu网页时就乱码了。我仿ConvertUTF8toGB2312写了个ConvertUTF8toGBK没效果,还是乱码。
#include <stdafx.h>#include <afxinet.h>//将UTF8字符串转换为gb2312 CString ConvertUTF8toGB2312(const char *pData, size_t size){ size_t n = MultiByteToWideChar(CP_UTF8, 0, pData, (int)size, NULL, 0); WCHAR * pChar = new WCHAR[n+1]; n = MultiByteToWideChar(CP_UTF8, 0, pData, (int)size, pChar, n); pChar[n]=0; n = WideCharToMultiByte(936, 0, pChar, -1, 0, 0, 0, 0); char *p = new char[n+1]; n = WideCharToMultiByte(936, 0, pChar, -1, p, (int)n, 0, 0); CString result(p); delete []pChar; delete []p; return result;} CString ConvertUTF8toGBK(const char *pData, size_t size){ size_t n = MultiByteToWideChar(CP_UTF8, 0, pData, (int)size, NULL, 0); WCHAR * pChar = new WCHAR[n+1]; n = MultiByteToWideChar(CP_UTF8, 0, pData, (int)size, pChar, n); pChar[n]=0; n = WideCharToMultiByte(20936, 0, pChar, -1, 0, 0, 0, 0); char *p = new char[n+1]; n = WideCharToMultiByte(20936, 0, pChar, -1, p, (int)n, 0, 0); CString result(p); delete []pChar; delete []p; return result;} CString GetPageHtml(CString strUrl) { CString strHtml = _T("");//获取HTML CInternetSession session(L"HttpClient"); //CString strUrl = L"http://www.yahoo.com.cn"; CHttpFile* pFile = (CHttpFile*)session.OpenURL((LPCTSTR)strUrl); DWORD dwStatusCode; pFile -> QueryInfoStatusCode(dwStatusCode); if(dwStatusCode == HTTP_STATUS_OK) { char sRecived[1024]; while(pFile->Read((LPTSTR)sRecived, 1024) != NULL) { strHtml += sRecived; } //if(strHtml.Find(L"utf-8") > -1) { USES_CONVERSION; char *pChar = W2A(strHtml.GetBuffer()); CString tt= UTF82Ansi(pChar); [color=#FF0000]//CString strTest = ConvertUTF8toGBK(pChar,strlen(pChar));[/color] CString strTemp = ConvertUTF8toGB2312(pChar,strlen(pChar)); strHtml = strTemp; } } pFile -> Close(); delete pFile; session.Close(); return strHtml;}int _tmain(int argc, _TCHAR* argv[]){ CString x = GetPageHtml("http://tv.sohu.com/20111017/n322381354.shtml"); return 0 ;}