-- steve_howard_tsqlchallenge_14.sql /* 10/2/2009 - Steve Howard * TSQL Challenge #14 - Identify the longest sequence of characters in a string * http://beyondrelational.com/blogs/tc/archive/2009/09/28/tsql-challenge-1... * * Strategy: This solution uses 3 CTE's. First, the recursive (Streaks) CTE * builds up a list of all streaks found in each data. This includes * streaks of length 1 and substreaks of longer streaks, necessitating the * second (GroupedStreaks) CTE. * * The second (GroupedStreaks) CTE takes the data from the Streaks CTE and * removes length 1 streaks and substreaks. The GroupedStreaks CTE provides * all the data necessary for the final result set, however, it does not * include enough information to properly sort it. (Sorting is first by * Data with longest streak, then by Data with the most streaks, then by * start Pos within each data.) * * The third (StreakStats) CTE finds the longest streak and the total count * of streaks for each Data. Joining this to the GroupedStreaks CTE on Data * allows proper sorting of the result set. */ --Sample Data DECLARE @t TABLE (Data VARCHAR(40) ) INSERT @t (Data) SELECT '9992EDC6-D117-4DEE-B410-4E5FAE46AE97' INSERT @t (Data) SELECT '0BFC936B-BD9A-4C6A-AFB2-CF3F1752F8B1' INSERT @t (Data) SELECT '4A73E7EB-7777-4A04-9258-F1E75097977C' INSERT @t (Data) SELECT '5AAF477C-274D-400D-9067-035968F33B19' INSERT @t (Data) SELECT '725DA718-30D0-44A9-B36A-89F27CDFEEDE' INSERT @t (Data) SELECT '8083ED5A-D3B9-4694-BB04-F0B09C588888' --SELECT * FROM @t --Recursive CTE to generate all streaks of the same letter/digit --4 Columns: -- Data = the source Data element being looked at from @t -- [Char] = the actual letter/digit for the current streak -- Pos = the position (index) within Data at which the current streak began -- [Len] = the length of the streak (number of same letter/digit in a row) -- --Initial Conditions: Start with the first letter of each Data, starting with --Pos = 1 and [Len] = 1. -- --Each Recursion: Compare the next character from Data (at index Pos+[Len]) --to the character of the current streak. If the character matches, extend --the length of the streak by 1. If it doesn't match, start a new streak for --the new character with [Len] = 1 and Pos = the current position, Pos+[Len]. -- --End condition: Terminate when the current position, Pos+[Len], is greater --than the length of the source Data. ;WITH Streaks (Data, [Char], Pos, [Len]) AS ( SELECT Data, LEFT(Data, 1), 1, 1 FROM @t UNION ALL SELECT Data, SUBSTRING(Data, Pos+[Len], 1), CASE WHEN SUBSTRING(Data, Pos+[Len], 1) = [Char] THEN Pos ELSE Pos + [Len] END, CASE WHEN SUBSTRING(Data, Pos+[Len], 1) = [Char] THEN [Len] + 1 ELSE 1 END FROM Streaks WHERE Pos+[Len] <= LEN(Data) ), --GroupedStreaks CTE filters out streaks of Len = 1 and removes substreaks for --the same Data/[Char]/Pos by grouping them and keeping the MAX([Len]). GroupedStreaks (Data, [Char], Pos, Len) AS ( SELECT Data, [Char], Pos, MAX([Len]) FROM Streaks WHERE [Len] > 1 GROUP BY Data, [Char], Pos ), --StreakStats CTE finds the longest streak and total count of streaks for each --Data from GroupedStreaks. This is used for ordering the final result set. StreakStats (Data, LongestStreak, StreakCount) AS ( SELECT Data, MAX([Len]), COUNT([Len]) FROM GroupedStreaks GROUP BY Data ) --Final SELECT for result set SELECT GS.Data, GS.[Char], GS.Pos, GS.[Len] FROM GroupedStreaks GS INNER JOIN StreakStats SS ON GS.Data = SS.Data ORDER BY SS.LongestStreak DESC, SS.StreakCount DESC, GS.Pos