Thursday, July 22, 2010

Detecting Encoding of a File - C#

The following method reads the byte-order mark of a file and returns a string representing the encoding type.

Special thanks to Heath Stewart, whose tutorial helped me understand encoding better and made it possible for me to write this method. Check out his tutorial here.

//@Return: Returns the name of the encoding of the file at filePath
        /// <summary>
        /// Identifies a file's encoding by inspecting the byte-order mark (BOM) at the
        /// start of the file.
        /// </summary>
        /// <param name="filePath">Path of the file to inspect.</param>
        /// <returns>
        /// A label describing the detected BOM; "ASCII" when no recognised BOM is present
        /// (this includes empty files); or null when reading the file fails, in which case
        /// the error message is written to standard error.
        /// </returns>
        /// <remarks>
        /// Exceptions raised while opening the file (for example, a missing file) are not
        /// caught here and propagate to the caller.
        /// </remarks>
        public static string GetFileEncoding(string filePath)
        {
            const int BomLength = 4;

            // The using statement closes the stream on every exit path.
            using (FileStream file = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                byte[] bom = new byte[BomLength];
                int bytesRead = 0;

                try
                {
                    // Read may return fewer bytes than requested, so keep reading until
                    // the buffer is full or the end of the file is reached.
                    while (bytesRead < BomLength)
                    {
                        int count = file.Read(bom, bytesRead, BomLength - bytesRead);
                        if (count == 0)
                        {
                            break;
                        }

                        bytesRead += count;
                    }
                }
                catch (IOException e)
                {
                    // Best effort: report the failure and signal "unknown" with null.
                    Console.Error.WriteLine("ERROR: " + e.Message);
                    return null;
                }

                // Only compare bytes that were actually read from the file.
                if (bytesRead >= 3 && bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
                {
                    return "UTF-8";
                }

                if (bytesRead >= 2 && bom[0] == 0xff && bom[1] == 0xfe)
                {
                    // Little-endian BOM; the previous label used the digit '1' in place of
                    // the letter 'l' ("21e" instead of "2le").
                    return "UCS-2le, UCS-4le, and UCS-16le";
                }

                if (bytesRead >= 2 && bom[0] == 0xfe && bom[1] == 0xff)
                {
                    return "UTF-16 and UCS-2";
                }

                if (bytesRead >= 4 && bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff)
                {
                    return "UCS-4";
                }

                // No recognised BOM: fall back to the default label.
                return "ASCII";
            }
        }

0 comments:

Post a Comment