VB 访问UTF文本文件的模块 | 雨律在线

支持UTF文本文件访问的模块
支持常见的ANSI、UTF-8、UTF-16LE、UTF-16BE这几种编码文本

【原理】
以二进制方式打开,判断BOM标记,自己写格式转换程序
对于UTF-8
可以以用MultiByteToWideChar将其转为Unicode格式,使用Windows2000新增代码页65001
对于UTF-16LE
VB的String用的就是UTF-16LE格式,先用字节数组读取文件内容,再直接给字符串变量赋值(sText = byBuf)
对于UTF-16BE
这是大端方式的UTF-16,先还是用字节数组读取文件内容,然后在字节数组中两个两个地交换相邻字节,再直接给字符串变量赋值
对于UTF-32
UTF-32采用的是4字节编码,只能手动转换,幸亏其不多见。

【代码】

Option Explicit

'mTextUTF.bas
'模块:UTF文本文件访问
'作者:zyl910
'版本:1.0
'日期:2006-1-23

'== 说明 ===================================================
'支持Unicode编码的文本文件读写。暂时支持ANSI、UTF-8、UTF-16LE、UTF-16BE这几种编码文本

'== 更新记录 ===============================================
'[V1.0] 2006-1-23
'1.支持最常见的ANSI、UTF-8、UTF-16LE、UTF-16BE这几种编码文本

'## 编译预处理常数 #########################################
'== 全局常数 ===============================================
'IncludeAPILib:引用了API库,此时不需要手动写API声明

'## API ####################################################
#If IncludeAPILib = 0 Then
'== File ===================================================
Private Declare Function CreateFile()Function CreateFile Lib "kernel32" Alias "CreateFileA" (ByVal lpFileName As String, ByVal dwDesiredAccess As Long, ByVal dwShareMode As Long, ByVal lpSecurityAttributes As Long, ByVal dwCreationDisposition As Long, ByVal dwFlagsAndAttributes As Long, ByVal hTemplateFile As Long) As Long
Private Declare Function
CloseHandle()Function CloseHandle Lib "kernel32" (ByVal hObject As Long) As Long
Private Declare Function
ReadFile()Function ReadFile Lib "kernel32" (ByVal hFile As Long, lpBuffer As Any, ByVal nNumberOfBytesToRead As Long, lpNumberOfBytesRead As Long, ByVal lpOverlapped As Long) As Long
Private Declare Function
WriteFile()Function WriteFile Lib "kernel32" (ByVal hFile As Long, lpBuffer As Any, ByVal nNumberOfBytesToWrite As Long, lpNumberOfBytesWritten As Long, ByVal lpOverlapped As Long) As Long
Private Declare Function
GetFileSize()Function GetFileSize Lib "kernel32" (ByVal hFile As Long, lpFileSizeHigh As Long) As Long
Private Declare Function
SetFilePointer()Function SetFilePointer Lib "kernel32" (ByVal hFile As Long, ByVal lDistanceToMove As Long, lpDistanceToMoveHigh As Long, ByVal dwMoveMethod As Long) As Long

Private Const
INVALID_HANDLE_VALUE = -1

Private Const GENERIC_READ = &H80000000
Private Const GENERIC_WRITE = &H40000000

Private Const FILE_SHARE_READ = &H1
Private Const FILE_SHARE_WRITE = &H2

Private Const Create_NEW = 1
Private Const Create_ALWAYS = 2
Private Const OPEN_EXISTING = 3
Private Const OPEN_ALWAYS = 4
Private Const TRUNCATE_EXISTING = 5

Priv
ate Const
FILE_ATTRIBUTE_NORMAL = &H80

Private Const FILE_BEGIN = 0
Private Const FILE_CURRENT = 1
Private Const FILE_END = 2

'== Unicode ================================================

Private Declare Function MultiByteToWideChar()Function MultiByteToWideChar Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByRef lpWideCharStr As Any, ByVal cchWideChar As Long) As Long
Private Declare Function
WideCharToMultiByte()Function WideCharToMultiByte Lib "kernel32" (ByVal CodePage As Long, ByVal dwFlags As Long, ByRef lpWideCharStr As Any, ByVal cchWideChar As Long, ByRef lpMultiByteStr As Any, ByVal cchMultiByte As Long, ByRef lpDefaultChar As Any, ByVal lpUsedDefaultChar As Long) As Long

Private Const
CP_UTF8 As Long = 65001

#End If

'###########################################################

'Unicode编码格式
Public Enum UnicodeEncodeFormatEnum UnicodeEncodeFormat
UEF_ANSI =
0 'ANSI+DBCS
UEF_UTF8 'UTF-8
UEF_UTF16LE 'UTF-16LE
UEF_UTF16BE 'UTF-16BE
UEF_UTF32LE 'UTF-32LE
UEF_UTF32BE 'UTF-32BE

UEF_Auto = -1 '自动识别编码

'隐藏项目
[_UEF_Min] = UEF_ANSI
[_UEF_Max] = UEF_UTF32BE

End Enum

'ANSI+DBCS方式的文本所使用的代码页。默认为0,表示使用系统当前代码页。可以利用该参数实现读取其他代码编码的文本,比如想在 简体中文平台下 读取 繁体中文平台生成的txt,就将它设为950
Public UEFCodePage As Long

'判断BOM
'返回值:BOM所占字节
'dwFirst:[in]文件最开始的4个字节
'fmt:[out]返回编码类型
Public Function UEFCheckBOM()Function UEFCheckBOM(ByVal dwFirst As Long, ByRef fmt As UnicodeEncodeFormat) As Long
If
dwFirst = &HFEFF& Then
fmt = UEF_UTF32LE
UEFCheckBOM =
4
ElseIf dwFirst = &HFFFE0000 Then
fmt = UEF_UTF32BE
UEFCheckBOM =
4
ElseIf (dwFirst And &HFFFF&) = &HFEFF& Then
fmt = UEF_UTF16LE
UEFCheckBOM =
2
ElseIf (dwFirst And &HFFFF&) = &HFFFE& Then
fmt = UEF_UTF16BE
UEFCheckBOM =
2
ElseIf (dwFirst And &HFFFFFF) = &HBFBBEF Then
fmt = UEF_UTF8
UEFCheckBOM =
3
Else
fmt = UEF_ANSI
UEFCheckBOM =
0
End If
End Function

'生成BOM
'返回值:BOM所占字节
'fmt:[in]编码类型
'dwFirst:[out]文件最开始的4个字节
Public Function UEFMakeBOM()Function UEFMakeBOM(ByVal fmt As UnicodeEncodeFormat, ByRef dwFirst As Long) As Long
Selec
t Case
fmt
Case UEF_UTF8
dwFirst =
&HBFBBEF
UEFMakeBOM = 3
Case UEF_UTF16LE
dwFirst =
&HFEFF&
UEFMakeBOM =
2
Case UEF_UTF16BE
dwFirst =
&HFFFE&
UEFMakeBOM =
2
Case UEF_UTF32LE
dwFirst =
&HFEFF&
UEFMakeBOM =
4
Case UEF_UTF32BE
dwFirst =
&HFFFE0000
UEFMakeBOM = 4
Case Else
dwFirst = 0
UEFMakeBOM = 0
End Select
End Function

'判断文本文件的编码类型
'返回值:编码类型。文件无法打开时,返回UEF_Auto
'FileName:文件名
Public Function UEFCheckTextFileFormat()Function UEFCheckTextFileFormat(ByVal FileName As String) As UnicodeEncodeFormat
Dim hFile As Long
Dim
dwFirst As Long
Dim
nNumRead As Long

'打开文件
hFile = CreateFile(FileName, GENERIC_READ, FILE_SHARE_READ Or FILE_SHARE_WRITE, ByVal 0&, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, ByVal 0&)
If INVALID_HANDLE_VALUE = hFile Then '文件无法打开
UEFCheckTextFileFormat = UEF_Auto
Exit Function
End If

'判断BOM
dwFirst = 0
Call ReadFile(hFile, dwFirst, 4, nNumRead, ByVal 0&)
nNumRead = UEFCheckBOM(dwFirst, UEFCheckTextFileFormat)
'Debug.Print nNumRead

'关闭文件
Call CloseHandle(hFile)

End Function

'读取文本文件
'返回值:读取的文本。返回vbNullString表示文件无法打开
'FileName:[in]文件名
'fmt:[in,out]使用何种文本编码格式来读取文本。为UEF_Auto时表示自动判断,且在fmt参数返回文本所用编码格式
Public Function UEFLoadTextFile()Function UEFLoadTextFile(ByVal FileName As String, Optional ByRef fmt As UnicodeEncodeFormat = UEF_Auto) As String
Dim
hFile As Long
Dim
nFileSize As Long
Dim
nNumRead As Long
Dim
dwFirst As Long
Dim
CurFmt As UnicodeEncodeFormat
Dim cbBOM As Long
Dim
cbTextData As Long
Dim
CurCP As Long
Dim
byBuf() As Byte
Dim
cchStr As Long
Dim
I As Long
Dim
byTemp As Byte

'判断fmt范围
If fmt <> UEF_Auto Then
If
fmt < [_UEF_Min] Or fmt > [_UEF_Max] Then
GoTo
FunEnd
End If
End If

'打开文件
hFile = CreateFile(FileName, GENERIC_READ, FILE_SHARE_READ Or FILE_SHARE_WRITE, ByVal 0&, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, ByVal 0&)
If INVALID_HANDLE_VALUE = hFile Then '文件无法打开
GoTo FunEnd
End If

'判断文件大小
nFileSize = GetFileSize(hFile, nNumRead)
If nNumRead <> 0 Then '超过4GB
GoTo FreeHandle
End If
If
nFileSize < 0 Then '超过2GB
GoTo FreeHandle
End If

'判断BOM
dwFirst = 0
Call ReadFile(hFile, dwFirst, 4, nNumRead, ByVal 0&)
cbBOM = UEFCheckBOM(dwFirst, Cur
Fmt)

'恢复文件指针
If fmt = UEF_Auto Then '自动判断
fmt = CurFmt
'cbBOM = cbBOM
Else '手动设置编码
If fmt = CurFmt Then '若编码相同,则忽略BOM标记
'cbBOM = cbBOM
Else '编码不同,那么都是数据
cbBOM = 0
End If
End If
Call
SetFilePointer(hFile, cbBOM, ByVal 0&, FILE_BEGIN)
cbTextData = nFileSize - cbBOM

'读取数据
UEFLoadTextFile = ""
Select Case fmt
Case UEF_ANSI, UEF_UTF8
'判断应使用的CodePage
CurCP = IIf(fmt = UEF_UTF8, CP_UTF8, UEFCodePage)

'分配缓冲区
On Error GoTo FreeHandle
ReDim byBuf(0 To cbTextData - 1)
On Error GoTo 0

'读取数据
nNumRead = 0
Call ReadFile(hFile, byBuf(0), cbTextData, nNumRead, ByVal 0&)

'取得Unicode文本长度
cchStr = MultiByteToWideChar(CurCP, 0, byBuf(0), nNumRead, ByVal 0&, ByVal 0&)
If cchStr > 0 Then
'分配字符串空间
On Error GoTo FreeHandle
UEFLoadTextFile =
String$(cchStr, 0)
On Error GoTo 0

'取得文本
cchStr = MultiByteToWideChar(CurCP, 0, byBuf(0), nNumRead, ByVal StrPtr(UEFLoadTextFile), cchStr + 1)

End If

Case
UEF_UTF16LE
cchStr = (cbTextData +
1) 2

'分配字符串空间
On Error GoTo FreeHandle
UEFLoadTextFile =
String$(cchStr, 0)
On Error GoTo 0

'取得文本
nNumRead = 0
Call ReadFile(hFile, ByVal StrPtr(UEFLoadTextFile), cbTextData, nNumRead, ByVal 0&)

'修正文本长度
cchStr = (nNumRead + 1) 2
If cchStr > 0 Then
If
Len(UEFLoadTextFile) > cchStr Then
UEFLoadTextFile = Left$(UEFLoadTextFile, cchStr)
End If
Else
UEFLoadTextFile = ""
End If

Case
UEF_UTF16BE
'分配缓冲区
On Error GoTo FreeHandle
ReDim byBuf(0 To cbTextData - 1)
On Error GoTo 0

'读取数据
nNumRead = 0
Call ReadFile(hFile, byBuf(0), cbTextData, nNumRead, ByVal 0&)

If nNumRead > 0 Then
'隔两字节翻转相邻字节
For I = 0 To nNumRead - 1 - 1 Step 2 '再-1是为了避免最后多出的那个字节
byTemp = byBuf(I)
byBuf(I) = byBuf(I +
1)
byBuf(I +
1) = byTemp
Next I

'取得文本
UEFLoadTextFile = byBuf 'VB允许String中的字符串数

 
目前有0条回应
Comment
Trackback
你目前的身份是游客,请输入昵称和电邮!