0%

C#自动识别文件编码 - 晓晨Master - 博客园

Excerpt

在做导入微信商户后台退款数据时,无论怎么设置编码导出来都是乱码,后来在网上找了这个识别文件编码的代码,感觉不错。 最后识别出来是gb2312,看来我还是太渣了,只能吃土了,竟然忘记了这个编码。 下面,上代码。


在做导入微信商户后台退款数据时,无论怎么设置编码导出来都是乱码,后来在网上找了这个识别文件编码的代码,感觉不错。

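最后识别出来是 GB2312(这段代码本身只能通过 BOM 或字节特征识别 UTF-8/UTF-16,其余情况一律返回 Encoding.Default,而在简体中文 Windows 上 Encoding.Default 就是 GB2312),看来我还是太渣了,只能吃土了,竟然忘记了这个编码。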

下面,上代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
/// <summary>
/// Detects the character encoding of a text file or stream.
/// Detection is based on the byte order mark (BOM) and, for
/// <see cref="GetEncoding(Stream)"/>, on a UTF-8 well-formedness heuristic
/// when no BOM is present.
/// </summary>
public class TxtFileEncoder
{
    /// <summary>
    /// Initializes a new instance. Every member is static; the constructor is
    /// kept only so that existing code which instantiates the class still compiles.
    /// </summary>
    public TxtFileEncoder()
    {
    }

    /// <summary>
    /// Detects the encoding of a text file from its BOM.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <returns>
    /// The encoding indicated by the BOM, or <see cref="Encoding.Default"/>
    /// when the file has no recognised BOM.
    /// </returns>
    public static Encoding GetEncoding(string fileName)
    {
        return GetEncoding(fileName, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file stream from its BOM.
    /// </summary>
    /// <param name="stream">The file stream to inspect. Its position is preserved.</param>
    /// <returns>
    /// The encoding indicated by the BOM, or <see cref="Encoding.Default"/>
    /// when the stream has no recognised BOM.
    /// </returns>
    public static Encoding GetEncoding(FileStream stream)
    {
        return GetEncoding(stream, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file from its BOM.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <param name="defaultEncoding">Encoding returned when no recognised BOM is found.</param>
    /// <returns>The encoding indicated by the BOM, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
    {
        // Read-only access is all that is needed; requesting the default
        // ReadWrite access would fail on read-only files. The using block
        // releases the handle even if detection throws.
        using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            return GetEncoding(fs, defaultEncoding);
        }
    }

    /// <summary>
    /// Detects the encoding of a text file stream from its BOM.
    /// </summary>
    /// <param name="stream">
    /// The file stream to inspect. May be null. The stream position is restored
    /// to its value at the time of the call before this method returns.
    /// </param>
    /// <param name="defaultEncoding">
    /// Encoding returned when the stream is null, shorter than two bytes,
    /// or has no recognised BOM.
    /// </param>
    /// <returns>The encoding indicated by the BOM, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
    {
        if (stream == null || stream.Length < 2)
        {
            return defaultEncoding;
        }

        // Remember where the caller was BEFORE seeking to the start.
        long originalPosition = stream.Position;
        try
        {
            stream.Seek(0, SeekOrigin.Begin);

            byte[] head = new byte[4];
            int headCount = ReadFully(stream, head);

            return DetectBom(head, headCount) ?? defaultEncoding;
        }
        finally
        {
            stream.Seek(originalPosition, SeekOrigin.Begin);
        }
    }

    /// <summary>
    /// Detects the encoding of a stream, including UTF-8 text without a BOM.
    /// The stream is read from its current position to the end and is closed
    /// before this method returns.
    /// </summary>
    /// <param name="fs">The stream to inspect. Must be readable.</param>
    /// <returns>
    /// The encoding indicated by the BOM if one is present; otherwise
    /// <see cref="Encoding.UTF8"/> when the whole content is well-formed UTF-8
    /// (pure ASCII included); otherwise <see cref="Encoding.Default"/>.
    /// An empty stream yields <see cref="Encoding.Default"/>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="fs"/> is not readable.</exception>
    public static System.Text.Encoding GetEncoding(Stream fs)
    {
        if (fs == null)
        {
            throw new ArgumentNullException("fs");
        }
        if (!fs.CanRead)
        {
            throw new ArgumentException("Stream is not readable.", "fs");
        }

        // This overload has always closed the stream it was given; keep that
        // contract, but make sure it also happens when an exception is thrown.
        using (fs)
        {
            byte[] head = new byte[4];
            int headCount = ReadFully(fs, head);
            if (headCount == 0)
            {
                return Encoding.Default;
            }

            Encoding fromBom = DetectBom(head, headCount);
            if (fromBom != null)
            {
                return fromBom;
            }

            // No BOM: validate the WHOLE content, including the bytes already
            // read, so that a leading multi-byte character is not cut in half.
            // Reading in chunks avoids Stream.Length, which non-seekable
            // streams do not support.
            using (MemoryStream content = new MemoryStream())
            {
                content.Write(head, 0, headCount);

                byte[] chunk = new byte[8192];
                int read;
                while ((read = fs.Read(chunk, 0, chunk.Length)) > 0)
                {
                    content.Write(chunk, 0, read);
                }

                return IsUTF8Bytes(content.ToArray()) ? Encoding.UTF8 : Encoding.Default;
            }
        }
    }

    /// <summary>
    /// Maps a leading byte order mark to its encoding.
    /// </summary>
    /// <param name="head">Buffer holding the first bytes of the content.</param>
    /// <param name="count">Number of valid bytes in <paramref name="head"/>.</param>
    /// <returns>The encoding named by the BOM, or null when no BOM is recognised.</returns>
    private static Encoding DetectBom(byte[] head, int count)
    {
        if (count >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
        {
            return Encoding.UTF8;
        }
        if (count >= 4 && head[0] == 0x00 && head[1] == 0x00 && head[2] == 0xFE && head[3] == 0xFF)
        {
            return new UTF32Encoding(true, true); // UTF-32 big-endian
        }
        if (count >= 2 && head[0] == 0xFE && head[1] == 0xFF)
        {
            return Encoding.BigEndianUnicode;
        }
        if (count >= 2 && head[0] == 0xFF && head[1] == 0xFE)
        {
            // FF FE 00 00 is the UTF-32 LE BOM; it must be tested before
            // settling on UTF-16 LE because it shares the same two-byte prefix.
            if (count >= 4 && head[2] == 0x00 && head[3] == 0x00)
            {
                return Encoding.UTF32;
            }
            return Encoding.Unicode;
        }
        return null;
    }

    /// <summary>
    /// Fills <paramref name="buffer"/> from <paramref name="source"/>, looping
    /// because a single Read call may return fewer bytes than requested.
    /// </summary>
    /// <returns>The number of bytes read; less than the buffer length only at end of stream.</returns>
    private static int ReadFully(Stream source, byte[] buffer)
    {
        int total = 0;
        while (total < buffer.Length)
        {
            int read = source.Read(buffer, total, buffer.Length - total);
            if (read <= 0)
            {
                break;
            }
            total += read;
        }
        return total;
    }

    /// <summary>
    /// Checks whether the data is structurally valid UTF-8 (used for content
    /// without a BOM). This is a heuristic: short non-UTF-8 data can pass by chance.
    /// </summary>
    /// <param name="data">The bytes to check.</param>
    /// <returns>
    /// True when every byte is ASCII or part of a complete multi-byte sequence
    /// with a legal lead byte; false otherwise, including when the data ends
    /// in the middle of a sequence.
    /// </returns>
    private static bool IsUTF8Bytes(byte[] data)
    {
        int pending = 0; // continuation bytes still expected for the current character

        for (int i = 0; i < data.Length; i++)
        {
            byte current = data[i];

            if (pending > 0)
            {
                // Continuation bytes have the form 10xxxxxx.
                if ((current & 0xC0) != 0x80)
                {
                    return false;
                }
                pending--;
            }
            else if (current < 0x80)
            {
                // Single-byte (ASCII) character.
            }
            else if (current >= 0xC2 && current <= 0xDF)
            {
                pending = 1; // 110xxxxx; C0/C1 would be overlong encodings
            }
            else if (current >= 0xE0 && current <= 0xEF)
            {
                pending = 2; // 1110xxxx
            }
            else if (current >= 0xF0 && current <= 0xF4)
            {
                pending = 3; // 11110xxx; F5 and above exceed U+10FFFF
            }
            else
            {
                return false; // stray continuation byte or illegal lead byte
            }
        }

        // Data that stops mid-character is not valid UTF-8.
        return pending == 0;
    }
}