1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
/// <summary>
/// Detects the text encoding (<see cref="Encoding"/>) of a text file or stream.
/// </summary>
public class TxtFileEncoder
{
    /// <summary>
    /// Creates an instance. Every member of this class is static, so an
    /// instance is never required; the constructor is kept for existing callers.
    /// </summary>
    public TxtFileEncoder()
    {
    }

    /// <summary>
    /// Detects the encoding of a text file from its byte-order mark.
    /// <see cref="Encoding.Default"/> is returned when no known byte-order
    /// mark is found at the start of the file.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(string fileName)
    {
        return GetEncoding(fileName, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a file stream from its byte-order mark.
    /// <see cref="Encoding.Default"/> is returned when no known byte-order
    /// mark is found.
    /// </summary>
    /// <param name="stream">The file stream to inspect.</param>
    /// <returns>The detected encoding, or <see cref="Encoding.Default"/>.</returns>
    public static Encoding GetEncoding(FileStream stream)
    {
        return GetEncoding(stream, Encoding.Default);
    }

    /// <summary>
    /// Detects the encoding of a text file from its byte-order mark.
    /// </summary>
    /// <param name="fileName">Path of the file to inspect.</param>
    /// <param name="defaultEncoding">
    /// Encoding returned when no known byte-order mark is found at the start
    /// of the file.
    /// </param>
    /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(string fileName, Encoding defaultEncoding)
    {
        // Read-only access is enough for detection and lets read-only files be
        // inspected; the using block releases the handle even if detection throws.
        using (FileStream fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
        {
            return GetEncoding(fs, defaultEncoding);
        }
    }

    /// <summary>
    /// Detects the encoding of a file stream from its byte-order mark. The
    /// stream is inspected from its beginning and its position is restored to
    /// where it was before the call.
    /// </summary>
    /// <param name="stream">
    /// The file stream to inspect. When it is null or shorter than two bytes,
    /// <paramref name="defaultEncoding"/> is returned without reading.
    /// </param>
    /// <param name="defaultEncoding">
    /// Encoding returned when no known byte-order mark is found.
    /// </param>
    /// <returns>The detected encoding, or <paramref name="defaultEncoding"/>.</returns>
    public static Encoding GetEncoding(FileStream stream, Encoding defaultEncoding)
    {
        Encoding targetEncoding = defaultEncoding;
        if (stream == null || stream.Length < 2)
        {
            return targetEncoding;
        }

        // Remember the caller's position. Seek(0, Begin) always returns 0, so it
        // cannot be used to capture the position.
        long origPos = stream.Position;
        try
        {
            stream.Seek(0, SeekOrigin.Begin);

            // First (up to) four bytes of the stream; bytes beyond the end stay 0.
            byte[] head = new byte[4];
            int available = (int)Math.Min(stream.Length, (long)head.Length);
            for (int i = 0; i < available; i++)
            {
                int value = stream.ReadByte();
                if (value < 0)
                {
                    break; // the stream ended earlier than Length reported
                }
                head[i] = (byte)value;
            }

            // Byte-order marks:
            //   UTF-16 big-endian    FE FF
            //   UTF-16 little-endian FF FE
            //   UTF-8                EF BB BF
            if (head[0] == 0xFE && head[1] == 0xFF)
            {
                targetEncoding = Encoding.BigEndianUnicode;
            }
            // NOTE(review): the third-byte test is carried over unchanged from the
            // previous implementation; its purpose is not evident - confirm before removing.
            if (head[0] == 0xFF && head[1] == 0xFE && head[2] != 0xFF)
            {
                targetEncoding = Encoding.Unicode;
            }
            if (head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
            {
                targetEncoding = Encoding.UTF8;
            }
        }
        finally
        {
            // Put the stream back where the caller left it, even on failure.
            stream.Seek(origPos, SeekOrigin.Begin);
        }
        return targetEncoding;
    }

    /// <summary>
    /// Detects the encoding of a stream, including UTF-8 content that has no
    /// byte-order mark. Reading starts at the stream's current position.
    /// </summary>
    /// <remarks>
    /// The stream is closed before this method returns.
    /// </remarks>
    /// <param name="fs">The stream to inspect.</param>
    /// <returns>
    /// <see cref="Encoding.BigEndianUnicode"/>, <see cref="Encoding.Unicode"/> or
    /// <see cref="Encoding.UTF8"/> when the matching byte-order mark is present;
    /// <see cref="Encoding.UTF8"/> when the content has no byte-order mark but
    /// passes UTF-8 validation; otherwise <see cref="Encoding.Default"/>
    /// (also for an empty stream).
    /// </returns>
    public static System.Text.Encoding GetEncoding(Stream fs)
    {
        // Disposing the reader closes the underlying stream on every path.
        using (BinaryReader r = new BinaryReader(fs, System.Text.Encoding.Default))
        {
            // May hold fewer than four bytes when the stream is short.
            byte[] head = r.ReadBytes(4);

            if (head.Length >= 2 && head[0] == 0xFE && head[1] == 0xFF)
            {
                return Encoding.BigEndianUnicode;
            }
            if (head.Length >= 2 && head[0] == 0xFF && head[1] == 0xFE)
            {
                return Encoding.Unicode;
            }
            if (head.Length >= 3 && head[0] == 0xEF && head[1] == 0xBB && head[2] == 0xBF)
            {
                return Encoding.UTF8; // UTF-8 with byte-order mark
            }
            if (head.Length == 0)
            {
                return Encoding.Default; // nothing to inspect
            }

            // No byte-order mark: validate everything as UTF-8, starting with the
            // bytes already consumed so a character spanning them is not split.
            int pending = 0;
            bool valid = ScanUtf8(head, head.Length, ref pending);

            // Validate in fixed-size chunks so large inputs are never loaded whole.
            byte[] chunk = new byte[4096];
            int read;
            while (valid && (read = r.Read(chunk, 0, chunk.Length)) > 0)
            {
                valid = ScanUtf8(chunk, read, ref pending);
            }

            // pending != 0 means the data ended in the middle of a character.
            return (valid && pending == 0) ? Encoding.UTF8 : Encoding.Default;
        }
    }

    /// <summary>
    /// Checks whether a byte array is well-formed as UTF-8 without a
    /// byte-order mark (pure ASCII and empty input also qualify).
    /// </summary>
    /// <param name="data">The bytes to validate.</param>
    /// <returns>
    /// True when every byte belongs to a complete sequence; false on a
    /// malformed or truncated sequence.
    /// </returns>
    private static bool IsUTF8Bytes(byte[] data)
    {
        int pending = 0;
        return ScanUtf8(data, data.Length, ref pending) && pending == 0;
    }

    /// <summary>
    /// Validates one chunk of bytes as UTF-8, carrying the number of
    /// continuation bytes still owed across successive calls.
    /// </summary>
    /// <param name="buffer">Buffer holding the chunk.</param>
    /// <param name="count">Number of bytes at the start of the buffer to check.</param>
    /// <param name="pending">
    /// Continuation bytes still expected for the character in progress;
    /// pass 0 before the first chunk.
    /// </param>
    /// <returns>False as soon as a malformed byte is seen; otherwise true.</returns>
    private static bool ScanUtf8(byte[] buffer, int count, ref int pending)
    {
        for (int i = 0; i < count; i++)
        {
            byte current = buffer[i];
            if (pending > 0)
            {
                // Inside a multi-byte character: the byte must look like 10xxxxxx.
                if ((current & 0xC0) != 0x80)
                {
                    return false;
                }
                pending--;
                continue;
            }

            if (current < 0x80)
            {
                continue; // single-byte (ASCII) character
            }

            // Lead byte: the number of leading 1 bits is the total sequence length.
            int leadingOnes = 0;
            for (int mask = 0x80; mask != 0 && (current & mask) != 0; mask >>= 1)
            {
                leadingOnes++;
            }

            // One leading 1 is a stray continuation byte; more than six is not a
            // lead byte. Lengths 2 through 6 are accepted, as before.
            if (leadingOnes == 1 || leadingOnes > 6)
            {
                return false;
            }
            pending = leadingOnes - 1;
        }
        return true;
    }
}
|