Tuesday, April 14, 2015

Clean Every SQL Table Column of Unwanted Data

Here is a simple T-SQL snippet for Microsoft SQL Server 2005 (and probably other editions) that you can use to clean up the strings in the columns of your table by removing extra spaces, tabs, carriage returns, and newlines. This is useful when you have imported data from somewhere and need to do some cleaning on it...

/***************************************************************

DESCRIPTION:  This TSQL Code snippet will perform data cleaning on every varchar
column and nvarchar column in the specified table, except for varchar(max) and
nvarchar(max) columns.

PURPOSE:  This is useful for removing extra spaces, tabs, carriage returns, new lines, and
trimming the front and end of the strings in all your columns. Add other cleaning
routines if you would like, or combine multiple cleaning activities in one statement.

WARNING:  DATA LOSS WILL LIKELY RESULT
    Every [n]varchar column with a length less than max will be affected.

RECOMMENDED:  Test on a COPY of the desired table before use to determine data loss.



***************************************************************/


declare @sql as varchar(2000)
declare @Table as varchar(250)
declare @Column as varchar(250)
declare @Count as int
declare @x as int

set @Table = 'Authors'         --  ENTER YOUR TABLE NAME HERE
set @Column = ''
set @Count = 0
set @x = 1


-- delete the temp table if it exists
IF OBJECT_ID(N'tempdb..#aTempTable') IS NOT NULL drop table #aTempTable


-- get valid column names in a temp table
SELECT
     row_number() over (order by ordinal_position) as ROW
     ,COLUMN_NAME
into #aTempTable
FROM INFORMATION_SCHEMA.COLUMNS
WHERE
     data_type like '%varchar'            /* find all varchar and nvarchar columns */
     and table_name like @Table
     and character_maximum_length > 0    /* avoid varchar(max) columns which are -1 character_maximum_length */


-- how many columns are in the table?
set @Count = (select max(row) from #aTempTable)


while @x <= @Count          -- Loop through the column names, and perform data cleaning
begin

     -- get the column name for this iteration
     set @Column = (select column_name from #aTempTable where row = @x)

     -- notify user what is happening (on the messages tab in SQL Server Managment Studio)
     print ''
     print ''
     print cast(@x as varchar(20)) + '.  Cleaning ' + @Column



     -- perform various data cleaning activities on each column of the data in a given table


     -- replace tabs, carriage returns, and newlines with spaces
     set @sql = 'update ' + @Table + ' set [' + @Column + '] = replace(replace(replace([' + @Column +'],char(10),'' ''), char(13), '' ''),char(9),'' '')'
     --select (@sql)
     exec (@sql)


     -- replace two spaces with one
     set @sql = 'update ' + @Table + ' set [' + @Column + '] = replace(replace([' + @Column +'],''  '','' ''), ''  '', '' '')'
     --select (@sql)
     exec (@sql)


     -- trim everything, removing beginning and trailing spaces
     set @sql = 'update ' + @Table + ' set [' + @Column + '] = ltrim(rtrim([' + @Column +']))'
     --select (@sql)
     exec (@sql)



     -- add your specialized cleaning code here...




     -- go to next column name
     set @x = @x + 1



end


drop table #aTempTable

go


/* Sample Output -- that would be run by the exec (@sql) statements

update Author set [AuthorName] = replace(replace([AuthorName],char(10),' '), char(13), ' ')

*/




No comments:

Post a Comment