The following class can identify a file's encoding from its byte-order mark (BOM), check a file for consistent encoding, and convert a file to another encoding.
/// <summary>
/// Byte-order-mark (BOM) based helpers: report the encoding a file starts with,
/// scan a file for conflicting BOMs, and re-encode a text file.
/// </summary>
class EncodingTool
{
    // Names handed back to callers. The spelling is kept exactly as it has always
    // been returned (including the digit '1' where the letter 'l' was presumably
    // intended in the little-endian name) because callers may compare against it.
    private const string Utf8Name = "UTF-8";
    private const string LittleEndianName = "UCS-21e, UCS-41e, and UCS-161e";
    private const string BigEndianName = "UTF-16 and UCS-2";
    private const string Ucs4Name = "UCS-4";
    private const string AsciiName = "ASCII";

    // Number of bytes inspected per probe; the longest signature checked is 4 bytes.
    private const int ProbeLength = 4;

    /// <summary>
    /// Returns the name of the encoding indicated by the BOM at the start of the file.
    /// </summary>
    /// <param name="filePath">Path of the file to inspect.</param>
    /// <returns>
    /// One of the encoding names above; "ASCII" when no known BOM is present
    /// (including for an empty file). Returns null when the stream is not seekable
    /// or when reading fails (the error message is written to standard error).
    /// </returns>
    /// <remarks>Exceptions raised while opening the file propagate to the caller.</remarks>
    public static string GetFileEncoding(string filePath)
    {
        string encoding = null;
        using (FileStream file = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            try
            {
                // Non-seekable streams (pipes, devices) are not inspected; the result stays null.
                if (file.CanSeek)
                {
                    byte[] bom = new byte[ProbeLength];
                    int count = ReadChunk(file, bom);
                    encoding = DetectBom(bom, count) ?? AsciiName;
                }
            }
            catch (Exception e)
            {
                // Best effort: report the problem and fall through with a null result.
                Console.Error.WriteLine("ERROR: " + e.Message);
            }
        }
        return encoding;
    }

    /// <summary>
    /// Scans the file in 4-byte chunks and reports whether any later chunk starts
    /// with the BOM of a different encoding than the one found at the start.
    /// </summary>
    /// <param name="filePath">Path of the file to inspect.</param>
    /// <returns>
    /// False when a chunk begins with a known BOM whose encoding name differs from
    /// the one detected in the first chunk; true otherwise. Also true when the
    /// stream is not seekable or when reading fails (the error message is written
    /// to standard error). Nothing is printed about the encodings found.
    /// </returns>
    /// <remarks>
    /// This is a BOM heuristic only; it does not validate that the bytes are
    /// well-formed in the detected encoding. Chunks without a BOM are treated as
    /// neutral: flagging them as "ASCII" would mark every BOM-prefixed file as
    /// inconsistent. Exceptions raised while opening the file propagate to the caller.
    /// </remarks>
    public static bool ConsistantEncoding(string filePath)
    {
        bool consistant = true;
        using (FileStream file = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
        {
            try
            {
                // Non-seekable streams (pipes, devices) are not inspected.
                if (file.CanSeek)
                {
                    string encoding = null;
                    byte[] chunk = new byte[ProbeLength];
                    int count;
                    while ((count = ReadChunk(file, chunk)) != 0)
                    {
                        // Only the 'count' bytes just read are examined, so a short
                        // final chunk is never matched against leftover bytes.
                        string detected = DetectBom(chunk, count);
                        if (encoding == null)
                        {
                            encoding = detected ?? AsciiName;
                        }
                        else if (detected != null && detected != encoding)
                        {
                            consistant = false;
                            break; // the answer cannot change any more
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best effort: report the problem and return what is known so far.
                Console.Error.WriteLine("ERROR: " + e.Message);
            }
        }
        return consistant;
    }

    /// <summary>
    /// Re-encodes a text file line by line into
    /// "&lt;path without extension&gt;_&lt;enc.EncodingName&gt;.txt".
    /// </summary>
    /// <param name="filePath">Path of the text file to read.</param>
    /// <param name="enc">Encoding used for the output file.</param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="filePath"/> or <paramref name="enc"/> is null.
    /// </exception>
    /// <remarks>
    /// An existing output file is overwritten, so converting the same file twice
    /// does not duplicate its content. Every line is written with WriteLine, so
    /// the output always ends with a line terminator.
    /// </remarks>
    public static void ConvertEncoding(string filePath, Encoding enc)
    {
        if (filePath == null)
        {
            throw new ArgumentNullException("filePath");
        }
        if (enc == null)
        {
            throw new ArgumentNullException("enc");
        }

        // Strip whatever extension the input has instead of assuming it is 4 characters long.
        string outputPath = Path.ChangeExtension(filePath, null) + "_" + enc.EncodingName + ".txt";

        using (StreamReader sr = new StreamReader(filePath))
        using (StreamWriter sw = new StreamWriter(outputPath, false, enc))
        {
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                sw.WriteLine(line);
            }
        }
    }

    // Fills 'buffer' from 'stream' as far as possible; returns the number of bytes
    // read, which is less than buffer.Length only at end of stream.
    private static int ReadChunk(Stream stream, byte[] buffer)
    {
        int total = 0;
        while (total < buffer.Length)
        {
            int read = stream.Read(buffer, total, buffer.Length - total);
            if (read == 0)
            {
                break;
            }
            total += read;
        }
        return total;
    }

    // Maps the first 'count' bytes of 'bytes' to an encoding name, or null when
    // they do not start with one of the known BOM signatures.
    private static string DetectBom(byte[] bytes, int count)
    {
        if (count >= 3 && bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
        {
            return Utf8Name;
        }
        // FF FE also prefixes the 4-byte little-endian BOM, hence the combined name.
        if (count >= 2 && bytes[0] == 0xff && bytes[1] == 0xfe)
        {
            return LittleEndianName;
        }
        if (count >= 2 && bytes[0] == 0xfe && bytes[1] == 0xff)
        {
            return BigEndianName;
        }
        if (count >= 4 && bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xfe && bytes[3] == 0xff)
        {
            return Ucs4Name;
        }
        return null;
    }
}
Download the class here.

0 comments:
Post a Comment