Thursday, July 22, 2010

EncodingTool Class: Detects, Checks for Consistency, and Converts Encoding - C#

Download the class here.
The following class can identify a file's encoding from its byte-order mark (BOM), check whether the encoding is consistent throughout a file, and convert a file to another encoding (written to a new file).
// Utility methods that guess a file's text encoding from its byte-order mark (BOM),
// check a file for conflicting BOMs, and re-encode a text file into a new file.
class EncodingTool
    {
        // Labels handed back to callers. They are reproduced exactly as originally
        // published (including the "1e" spelling of the little-endian label, which
        // presumably was meant to read "le") so existing string comparisons keep working.
        private const string Utf8Label = "UTF-8";
        private const string LittleEndianLabel = "UCS-21e, UCS-41e, and UCS-161e";
        private const string BigEndian16Label = "UTF-16 and UCS-2";
        private const string Ucs4Label = "UCS-4";
        private const string AsciiLabel = "ASCII";

        // Number of bytes examined per chunk; the longest BOM checked here is 4 bytes.
        private const int ChunkSize = 4;

        //@Return: Returns the BOM label matching the first bytes of chunk, or null when
        //         the chunk does not start with any of the recognised BOMs.
        //         chunk must hold at least 4 bytes; unused positions must be zero.
        private static string DetectBom(byte[] chunk)
        {
            if (chunk[0] == 0xef && chunk[1] == 0xbb && chunk[2] == 0xbf)
            {
                return Utf8Label;
            }
            if (chunk[0] == 0xff && chunk[1] == 0xfe)
            {
                return LittleEndianLabel;
            }
            if (chunk[0] == 0xfe && chunk[1] == 0xff)
            {
                return BigEndian16Label;
            }
            if (chunk[0] == 0 && chunk[1] == 0 && chunk[2] == 0xfe && chunk[3] == 0xff)
            {
                return Ucs4Label;
            }
            return null;
        }

        //@Return: Number of bytes placed in buffer (0 at end of stream).
        // Zeroes the buffer first so a short read never leaves bytes from a previous
        // chunk behind, then keeps reading until the buffer is full or the stream ends,
        // because a single Read call is allowed to return fewer bytes than requested.
        private static int ReadChunk(Stream stream, byte[] buffer)
        {
            Array.Clear(buffer, 0, buffer.Length);

            int total = 0;
            while (total < buffer.Length)
            {
                int read = stream.Read(buffer, total, buffer.Length - total);
                if (read == 0)
                {
                    break;
                }
                total += read;
            }
            return total;
        }

        //@Return: Returns the name of the encoding of the file at filePath, decided from
        //         the byte-order mark in its first four bytes ("ASCII" when no BOM is
        //         recognised, including for an empty file).
        //         Returns null when the stream is not seekable or when reading fails;
        //         read errors are written to standard error. Exceptions raised while
        //         opening the file (missing file, bad path, ...) propagate to the caller.
        public static string GetFileEncoding(string filePath)
        {
            string encoding = null;

            using (FileStream file = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                try
                {
                    // Guard kept from the original: non-seekable streams yield null.
                    if (file.CanSeek)
                    {
                        byte[] bom = new byte[ChunkSize];
                        ReadChunk(file, bom);
                        encoding = DetectBom(bom) ?? AsciiLabel;
                    }
                }
                catch (Exception e)
                { Console.Error.WriteLine("ERROR: " + e.Message); }
            }

            return encoding;
        }

        //@Return: Returns true if no conflicting byte-order mark is found in the file.
        //         The file is scanned in consecutive 4-byte chunks. The first chunk fixes
        //         the expected encoding ("ASCII" when it carries no BOM); the result is
        //         false as soon as a later chunk starts with a BOM for a different
        //         encoding. Chunks that carry no BOM are neutral. Only BOMs aligned on a
        //         4-byte boundary are seen. Nothing is printed apart from read errors,
        //         which go to standard error; the value computed so far is then returned.
        public static bool ConsistantEncoding(string filePath)
        {
            bool consistant = true;

            using (FileStream file = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                try
                {
                    // Guard kept from the original: non-seekable streams are reported
                    // as consistent without being read.
                    if (file.CanSeek)
                    {
                        byte[] chunk = new byte[ChunkSize];
                        string encoding = null;

                        while (consistant && ReadChunk(file, chunk) != 0)
                        {
                            string marker = DetectBom(chunk);

                            if (encoding == null)
                            {
                                // First chunk decides what the rest is compared against.
                                encoding = marker ?? AsciiLabel;
                            }
                            else if (marker != null && marker != encoding)
                            {
                                // A BOM for another encoding appeared later in the file.
                                consistant = false;
                            }
                        }
                    }
                }
                catch (Exception e)
                { Console.Error.WriteLine("ERROR: " + e.Message); }
            }

            return consistant;
        }

        //@Done: Re-encodes the text file at filePath using enc and writes the result
        //       next to it as <name without extension>_<enc.EncodingName>.txt.
        //       The source is read line by line, so every output line ends with the
        //       writer's newline, including the last one.
        //       Throws ArgumentNullException when enc is null.
        public static void ConvertEncoding(string filePath, Encoding enc)
        {
            if (enc == null)
            {
                throw new ArgumentNullException("enc");
            }

            // Strip whatever extension the path has instead of assuming it is exactly
            // three characters long (passing null removes the extension and its dot).
            string outputPath = Path.ChangeExtension(filePath, null) + "_" + enc.EncodingName + ".txt";

            // Both streams are disposed even if reading or writing throws.
            // NOTE(review): the writer is opened in append mode, as in the original, so
            // converting the same file twice appends a second copy - confirm intended.
            using (StreamReader sr = new StreamReader(filePath))
            using (StreamWriter sw = new StreamWriter(outputPath, true, enc))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    sw.WriteLine(line);
                }
            }
        }
    }

Download the class here.

0 comments:

Post a Comment