I wrote a string processor class. You can use it.
Example:
metaKeywords = bodyText.Process(prepositions).OrderByDescending().TakeTop().GetWords().AsString();
Class:
/// <summary>
/// Extension methods that turn a body of text into a word-frequency list and
/// help to sort, truncate and print that list, e.g.
/// <c>text.Process(blackList).OrderByDescending().TakeTop().GetWords().AsString()</c>.
/// </summary>
public static class StringProcessor
{
    // Code points used by ToNormalString.
    private const char ArabicKaf = (char)1603;      // U+0643 ARABIC LETTER KAF
    private const char ArabicYeh = (char)1610;      // U+064A ARABIC LETTER YEH
    private const char PersianKeheh = (char)1705;   // U+06A9 ARABIC LETTER KEHEH
    private const char PersianYeh = (char)1740;     // U+06CC ARABIC LETTER FARSI YEH

    // Substrings removed from every word, applied in this order.
    // The last entry is a line feed: a bare "n" here would delete every letter n.
    private static readonly string[] StrippedTokens =
    {
        ".", "(", ")", "?", "!", ",", "<br>", ":", ";", "،", "-", "\n"
    };

    /// <summary>
    /// Replaces the Persian forms of kaf and yeh by their Arabic counterparts
    /// so that both spellings of a word compare equal.
    /// </summary>
    /// <param name="strText">Text to normalize; may be null.</param>
    /// <returns>The normalized text, or an empty string for null/empty input.</returns>
    public static string ToNormalString(this string strText)
    {
        if (String.IsNullOrEmpty(strText))
        {
            return String.Empty;
        }

        return strText.Replace(PersianKeheh, ArabicKaf).Replace(PersianYeh, ArabicYeh);
    }

    /// <summary>
    /// Splits the text into words, strips punctuation and counts how often each
    /// remaining word occurs.
    /// </summary>
    /// <param name="bodyText">Text to analyse; null is treated as empty.</param>
    /// <param name="blackListWords">Words to ignore (normalized with <see cref="ToNormalString"/> before comparison); may be null.</param>
    /// <param name="minimumWordLength">Shortest word length, after stripping, that is still counted.</param>
    /// <param name="splitor">Character that separates words.</param>
    /// <param name="perWordIsLowerCase">When true every word is lower-cased before counting.</param>
    /// <returns>One (word, occurrences) pair per distinct word, in no particular order.</returns>
    public static List<KeyValuePair<String, Int32>> Process(this String bodyText,
        List<String> blackListWords = null,
        int minimumWordLength = 3,
        char splitor = ' ',
        bool perWordIsLowerCase = true)
    {
        // Normalize the black list once instead of once per word of the text.
        HashSet<String> blackList = new HashSet<String>(StringComparer.Ordinal);
        if (blackListWords != null)
        {
            foreach (String blackWord in blackListWords)
            {
                blackList.Add(blackWord.ToNormalString());
            }
        }

        Dictionary<String, Int32> counts = new Dictionary<String, Int32>();
        foreach (string rawWord in bodyText.ToNormalString().Split(splitor))
        {
            string word = perWordIsLowerCase ? rawWord.ToLower() : rawWord;
            foreach (string token in StrippedTokens)
            {
                word = word.Replace(token, "");
            }
            word = word.Trim();

            // Empty tokens are never keywords; a word of exactly the minimum length is kept.
            if (word.Length == 0 || word.Length < minimumWordLength || blackList.Contains(word))
            {
                continue;
            }

            int occurrences;
            counts.TryGetValue(word, out occurrences);
            counts[word] = occurrences + 1;
        }

        return counts.ToList();
    }

    /// <summary>
    /// Sorts the pairs in descending order.
    /// </summary>
    /// <param name="list">Pairs produced by <see cref="Process"/>.</param>
    /// <param name="isBasedOnFrequency">True sorts by occurrence count, false sorts by word.</param>
    /// <returns>A new, sorted list.</returns>
    public static List<KeyValuePair<String, Int32>> OrderByDescending(this List<KeyValuePair<String, Int32>> list, bool isBasedOnFrequency = true)
    {
        return isBasedOnFrequency
            ? list.OrderByDescending(pair => pair.Value).ToList()
            : list.OrderByDescending(pair => pair.Key).ToList();
    }

    /// <summary>
    /// Returns the first <paramref name="n"/> pairs of the list.
    /// </summary>
    public static List<KeyValuePair<String, Int32>> TakeTop(this List<KeyValuePair<String, Int32>> list, Int32 n = 10)
    {
        return list.Take(n).ToList();
    }

    /// <summary>
    /// Projects the pairs to their words, keeping the order.
    /// </summary>
    public static List<String> GetWords(this List<KeyValuePair<String, Int32>> list)
    {
        return list.Select(pair => pair.Key).ToList();
    }

    /// <summary>
    /// Projects the pairs to their occurrence counts, keeping the order.
    /// </summary>
    public static List<Int32> GetFrequency(this List<KeyValuePair<String, Int32>> list)
    {
        return list.Select(pair => pair.Value).ToList();
    }

    /// <summary>
    /// Joins the items with <paramref name="seprator"/> between them
    /// (no separator after the last item).
    /// </summary>
    /// <param name="list">Items to print; null yields an empty string.</param>
    /// <param name="seprator">Text placed between two consecutive items.</param>
    public static String AsString<T>(this List<T> list, string seprator = ", ")
    {
        if (list == null)
        {
            return string.Empty;
        }

        return string.Join(seprator, list);
    }
}
Krya_Krya 0 / 0 / 0 Регистрация: 14.10.2018 Сообщений: 23 |
||||
1 |
||||
20.12.2019, 21:33. Показов 5129. Ответов 3 Метки foreach, новичок, сишарп (Все метки)
Подскажите пожалуйста, как с помощью foreach разбить строку на подстроки? Знаю как со сплитом это сделать, но хочется узнать все варианты. Код со сплитом:
0 |
Diamante 3453 / 2461 / 1169 Регистрация: 14.08.2016 Сообщений: 8,153 |
||||
20.12.2019, 22:10 |
2 |
|||
Krya_Krya, ну примерно так
только зачем?
0 |
Enifan 1840 / 1182 / 501 Регистрация: 14.10.2018 Сообщений: 3,179 |
||||
20.12.2019, 22:15 |
3 |
|||
Krya_Krya, в данному случаи я буду использовать динамический массив (список List<>) а не статический массив (обычный string[]), так как заранее кол-во слов не известно. Конечно можно сделать подсчет и создать статический массив нужного размера — но на это надо дополнительный цикл делать.
1 |
350 / 245 / 76 Регистрация: 18.03.2016 Сообщений: 979 |
|
21.12.2019, 00:01 |
4 |
Enifan, … Krya_Krya, попробуй через регулярные выражения. Например то что делает Enifan:
1 |
Let’s set up code to benchmark different approaches. Every word counter will implement this interface:
/// <summary>
/// Common contract of the word-counting strategies compared by the benchmark.
/// </summary>
interface IWordCounter
{
/// <summary>
/// Counts the words of the input identified by <paramref name="path"/>.
/// </summary>
/// <param name="path">Location of the input; the implementations shown in this file read it as a file path.</param>
/// <returns>A dictionary keyed by word; the implementations shown in this file store the number of occurrences as the value.</returns>
IDictionary<string, int> CountWords(string path);
}
And here’s our benchmark runner:
// The strategies to compare; fill in the implementations under test.
IWordCounter[] wordCounters = new IWordCounter[]
{
    // ...
};

// Time each strategy once on the same input file.
foreach (IWordCounter counter in wordCounters)
{
    // Start every measurement from a freshly collected heap.
    GC.Collect();
    GC.WaitForPendingFinalizers();

    Stopwatch timer = Stopwatch.StartNew();
    IDictionary<string, int> counts = counter.CountWords(path);
    timer.Stop();

    Console.WriteLine("{0}, {1} entries, {2}", counter.GetType().Name, counts.Count, timer.Elapsed);
}
Timings were taken with a release build, on the test file provided, no debugger attached, on .NET 4.5.2.
Here’s the original code:
/// <summary>
/// Baseline word counter: reads the file line by line through a StreamReader
/// and updates the dictionary with a ContainsKey check followed by an indexer access.
/// </summary>
class OriginalWordCounter : IWordCounter
{
    private static readonly char[] Delimiters = { ' ' };

    /// <summary>
    /// Counts the space-separated words of the file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <returns>Number of occurrences per word.</returns>
    public IDictionary<string, int> CountWords(string path)
    {
        Dictionary<string, int> totals = new Dictionary<string, int>();

        using (FileStream input = File.Open(path, FileMode.Open, FileAccess.Read))
        using (StreamReader reader = new StreamReader(input))
        {
            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                string[] tokens = line.Split(Delimiters, StringSplitOptions.RemoveEmptyEntries);
                for (int i = 0; i < tokens.Length; i++)
                {
                    string token = tokens[i];
                    if (!totals.ContainsKey(token))
                    {
                        totals.Add(token, 1);
                    }
                    else
                    {
                        totals[token] = totals[token] + 1;
                    }
                }
            }
        }

        return totals;
    }
}
On my machine, this takes about 8.2s.
We see an improvement using Heslacher’s suggestion to use TryGetValue
:
/// <summary>
/// Word counter that replaces the ContainsKey/indexer pair by a single
/// TryGetValue lookup followed by one indexer write.
/// </summary>
class OriginalTryGetWordCounter : IWordCounter
{
    private static readonly char[] Delimiters = { ' ' };

    /// <summary>
    /// Counts the space-separated words of the UTF-8 file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <returns>Number of occurrences per word.</returns>
    public IDictionary<string, int> CountWords(string path)
    {
        Dictionary<string, int> totals = new Dictionary<string, int>();

        foreach (string line in File.ReadLines(path, Encoding.UTF8))
        {
            string[] tokens = line.Split(Delimiters, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 0; i < tokens.Length; i++)
            {
                // TryGetValue leaves 0 in 'seen' for a word not counted yet.
                int seen;
                totals.TryGetValue(tokens[i], out seen);
                totals[tokens[i]] = seen + 1;
            }
        }

        return totals;
    }
}
This takes about 6.7s. (The use of File.ReadLines
here doesn’t seem to affect the timing, it’s just a bit cleaner.)
We get another improvement with Parallel.ForEach
together with a ConcurrentDictionary
:
/// <summary>
/// Word counter that processes the lines of the file with Parallel.ForEach and
/// accumulates the counts in a ConcurrentDictionary.
/// </summary>
class ParallelWordCounter : IWordCounter
{
    /// <summary>
    /// Counts the space-separated words of the UTF-8 file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">File to read.</param>
    /// <returns>Number of occurrences per word.</returns>
    public IDictionary<string, int> CountWords(string path)
    {
        ConcurrentDictionary<string, int> totals = new ConcurrentDictionary<string, int>();
        IEnumerable<string> lines = File.ReadLines(path, Encoding.UTF8);

        Parallel.ForEach(lines, line =>
        {
            char[] delimiters = new[] { ' ' };
            string[] tokens = line.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);
            foreach (string token in tokens)
            {
                // Insert 1 for a new word, otherwise add one to the stored count.
                totals.AddOrUpdate(token, 1, (key, current) => current + 1);
            }
        });

        return totals;
    }
}
This takes about 5.2s.
You might want to try some of the Parallel.ForEach
overloads to see if you can get any further improvements, and remember to take these results with a grain of salt.
Given a 2D grid of characters and a word/ multiple words, the task is to check if that word/words exist in the grid or not. A word can be matched in 4 directions at any point.
The 4 directions are Horizontally Left and Right, Vertically Up and Down.
Examples:
Input: grid[][] = {"axmy", "bgdf", "xeet", "raks"}, word = "geeks"; Output: Yes (the rows of the grid are: a x m y / b g d f / x e e t / r a k s). Input: grid[][] = {"axmy", "brdf", "xeet", "rass"}, word = "geeks"; Output: No
Source: Microsoft Interview
Approach when a single word is to be checked : The idea used here is described in the steps below:
- Check every cell, if the cell has the first character, then recur one by one and try all 4 directions from that cell for a match.
- Mark the position in the grid as visited and recur in the 4 possible directions.
- After recurring, again mark the position as unvisited.
- Once all the letters in the word are matched, return true.
Below is the implementation of the above approach:
C++
#include <bits/stdc++.h>
using
namespace
std;
// Grid dimensions. c is one larger than the 4 visible letters of a row so that
// each row can be initialised from a string literal including its '\0'.
#define r 4
#define c 5

// Returns true when pat[level..] can be spelled starting at cell (x, y) by
// moving up, down, left or right without using a cell twice.
//   mat        grid with c columns per row; cells on the current path are
//              temporarily overwritten with '#' and restored before returning
//   pat        word to find
//   x, y       cell that must hold pat[level]
//   nrow, ncol number of rows / columns that may be visited
//   level      index of the next letter of pat to match
bool findmatch(char mat[][c], const std::string& pat, int x, int y,
               int nrow, int ncol, int level)
{
    int l = static_cast<int>(pat.length());

    // Every letter has been matched.
    if (level == l)
        return true;

    // Out of the grid.
    if (x < 0 || y < 0 || x >= nrow || y >= ncol)
        return false;

    if (mat[x][y] != pat[level])
        return false;

    // Mark the cell as visited so that it cannot be reused on this path.
    char temp = mat[x][y];
    mat[x][y] = '#';

    // || stops exploring as soon as one direction succeeds.
    bool res = findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
            || findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
            || findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
            || findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

    // Un-mark the cell again.
    mat[x][y] = temp;
    return res;
}

// Returns true when pat occurs somewhere in the first nrow x ncol cells of mat.
bool checkMatch(char mat[][c], const std::string& pat, int nrow, int ncol)
{
    int l = static_cast<int>(pat.length());

    // A word longer than the number of cells cannot fit.
    if (l > nrow * ncol)
        return false;

    // Try every cell that holds the first letter as a starting point.
    for (int i = 0; i < nrow; i++) {
        for (int j = 0; j < ncol; j++) {
            if (mat[i][j] == pat[0] && findmatch(mat, pat, i, j, nrow, ncol, 0))
                return true;
        }
    }
    return false;
}
// Demo: look for "geeks" in a 4-letter-wide grid and print Yes or No.
int main()
{
    // Two-dimensional array: r rows of c chars (4 letters plus the '\0').
    char grid[r][c] = { "axmy", "bgdf", "xeet", "raks" };

    if (checkMatch(grid, "geeks", r, c))
        std::cout << "Yes";
    else
        std::cout << "No";
    return 0;
}
Java
/**
 * Checks whether a word can be spelled in a character grid by moving up, down,
 * left or right from cell to cell without using a cell twice.
 */
class GFG
{
    // Dimensions of the demo grid.
    static final int r = 4;
    static final int c = 4;

    /**
     * Returns true when pat.substring(level) can be spelled starting at (x, y).
     * Cells on the current path are temporarily overwritten with '#' and
     * restored before the method returns.
     *
     * @param mat   the grid
     * @param pat   the word to find
     * @param x     row of the cell that must hold pat.charAt(level)
     * @param y     column of that cell
     * @param nrow  number of rows of the grid
     * @param ncol  number of columns of the grid
     * @param level index of the next letter of pat to match
     */
    static boolean findmatch(char mat[][], String pat, int x, int y,
                             int nrow, int ncol, int level)
    {
        int l = pat.length();

        // Every letter has been matched.
        if (level == l)
            return true;

        // Out of the grid.
        if (x < 0 || y < 0 || x >= nrow || y >= ncol)
            return false;

        if (mat[x][y] != pat.charAt(level))
            return false;

        // Mark the cell as visited so that it cannot be reused on this path.
        char temp = mat[x][y];
        mat[x][y] = '#';

        // || stops exploring as soon as one direction succeeds.
        boolean res = findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
                   || findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
                   || findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
                   || findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

        // Un-mark the cell again.
        mat[x][y] = temp;
        return res;
    }

    /**
     * Returns true when pat occurs somewhere in the grid. The empty word is
     * considered to be present.
     */
    static boolean checkMatch(char mat[][], String pat, int nrow, int ncol)
    {
        int l = pat.length();

        // Nothing to match; also avoids charAt(0) on an empty string.
        if (l == 0)
            return true;

        // A word longer than the number of cells cannot fit.
        if (l > nrow * ncol)
            return false;

        // Try every cell that holds the first letter as a starting point.
        for (int i = 0; i < nrow; i++)
        {
            for (int j = 0; j < ncol; j++)
            {
                if (mat[i][j] == pat.charAt(0)
                        && findmatch(mat, pat, i, j, nrow, ncol, 0))
                    return true;
            }
        }
        return false;
    }

    /** Demo: look for "geeks" in a 4x4 grid and print Yes or No. */
    public static void main(String[] args)
    {
        char grid[][] = { "axmy".toCharArray(),
                          "bgdf".toCharArray(),
                          "xeet".toCharArray(),
                          "raks".toCharArray() };

        if (checkMatch(grid, "geeks", r, c))
            System.out.print("Yes");
        else
            System.out.print("No");
    }
}
Python3
# Dimensions of the demo grid.
r = 4
c = 4


def findmatch(mat, pat, x, y, nrow, ncol, level):
    """Return True when pat[level:] can be spelled starting at cell (x, y).

    Moves go up, down, left or right and a cell is never used twice on the
    same path.

    mat        -- list of rows; each row is a string or a sequence of characters.
                  The row of the current cell is temporarily replaced by a copy
                  holding '#' at the visited position and restored on return.
    pat        -- word to find
    x, y       -- cell that must hold pat[level]
    nrow, ncol -- number of rows / columns of the grid
    level      -- index of the next letter of pat to match
    """
    # Every letter has been matched.
    if level == len(pat):
        return True

    # Out of the grid.
    if x < 0 or y < 0 or x >= nrow or y >= ncol:
        return False

    if mat[x][y] != pat[level]:
        return False

    # Mark the cell as visited. Strings are immutable, so build a marked copy
    # of the row instead of relying on str.replace (which returns a new string
    # and would also touch every equal letter of the row).
    saved_row = mat[x]
    marked = list(saved_row)
    marked[y] = "#"
    mat[x] = "".join(marked) if isinstance(saved_row, str) else marked
    try:
        return (findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
                or findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
                or findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
                or findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1))
    finally:
        # Un-mark the cell again.
        mat[x] = saved_row


def checkMatch(mat, pat, nrow, ncol):
    """Return True when pat occurs somewhere in the nrow x ncol grid mat.

    The empty word is considered to be present.
    """
    l = len(pat)

    # Nothing to match; also avoids pat[0] on an empty string.
    if l == 0:
        return True

    # A word longer than the number of cells cannot fit.
    if l > nrow * ncol:
        return False

    # Try every cell that holds the first letter as a starting point.
    for i in range(nrow):
        for j in range(ncol):
            if mat[i][j] == pat[0] and findmatch(mat, pat, i, j, nrow, ncol, 0):
                return True
    return False
# Demo: look for "geeks" in a 4x4 grid and print Yes or No.
if __name__ == "__main__":
    grid = ["axmy", "bgdf", "xeet", "raks"]

    if checkMatch(grid, "geeks", r, c):
        print("Yes")
    else:
        print("No")
Javascript
<script>
// Dimensions of the demo grid.
let r = 4;
let c = 4;

/**
 * Returns true when pat.slice(level) can be spelled starting at cell (x, y),
 * moving up, down, left or right without using a cell twice.
 *
 * @param {string[][]} mat   grid as an array of character arrays; cells on the
 *                           current path are temporarily set to '#'
 * @param {string}     pat   word to find
 * @param {number}     x     row of the cell that must hold pat[level]
 * @param {number}     y     column of that cell
 * @param {number}     nrow  number of rows of the grid
 * @param {number}     ncol  number of columns of the grid
 * @param {number}     level index of the next letter of pat to match
 * @returns {boolean}
 */
function findmatch(mat, pat, x, y, nrow, ncol, level) {
    // Every letter has been matched.
    if (level === pat.length) {
        return true;
    }

    // Out of the grid.
    if (x < 0 || y < 0 || x >= nrow || y >= ncol) {
        return false;
    }

    if (mat[x][y] !== pat[level]) {
        return false;
    }

    // Mark the cell as visited so that it cannot be reused on this path.
    const temp = mat[x][y];
    mat[x][y] = '#';

    // || stops exploring as soon as one direction succeeds and keeps the
    // result a boolean (bitwise | would yield a number).
    const res =
        findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1) ||
        findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1) ||
        findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1) ||
        findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

    // Un-mark the cell again.
    mat[x][y] = temp;
    return res;
}

/**
 * Returns true when pat occurs somewhere in the nrow x ncol grid mat.
 * The empty word is considered to be present.
 */
function checkMatch(mat, pat, nrow, ncol) {
    const l = pat.length;

    // Nothing to match.
    if (l === 0) {
        return true;
    }

    // A word longer than the number of cells cannot fit.
    if (l > nrow * ncol) {
        return false;
    }

    // Try every cell that holds the first letter as a starting point.
    for (let i = 0; i < nrow; i++) {
        for (let j = 0; j < ncol; j++) {
            if (mat[i][j] === pat[0] && findmatch(mat, pat, i, j, nrow, ncol, 0)) {
                return true;
            }
        }
    }
    return false;
}
// Demo: look for "geeks" in a 4x4 grid and write Yes or No into the page.
let grid = ["axmy", "bgdf", "xeet", "raks"].map(row => row.split(''));

document.write(checkMatch(grid, "geeks", r, c) ? "Yes" : "No");
</script>
C#
using
System;
/// <summary>
/// Checks whether a word can be spelled in a character grid by moving up, down,
/// left or right from cell to cell without using a cell twice.
/// </summary>
class GFG
{
    // Dimensions of the demo grid.
    static readonly int r = 4;
    static readonly int c = 4;

    /// <summary>
    /// Returns true when the letters of <paramref name="pat"/> from index
    /// <paramref name="level"/> onwards can be spelled starting at (x, y).
    /// Cells on the current path are temporarily overwritten with '#' and
    /// restored before the method returns.
    /// </summary>
    /// <param name="mat">The grid.</param>
    /// <param name="pat">The word to find.</param>
    /// <param name="x">Row of the cell that must hold pat[level].</param>
    /// <param name="y">Column of that cell.</param>
    /// <param name="nrow">Number of rows of the grid.</param>
    /// <param name="ncol">Number of columns of the grid.</param>
    /// <param name="level">Index of the next letter of pat to match.</param>
    static bool findmatch(char[,] mat, String pat, int x, int y,
                          int nrow, int ncol, int level)
    {
        int l = pat.Length;

        // Every letter has been matched.
        if (level == l)
            return true;

        // Out of the grid.
        if (x < 0 || y < 0 || x >= nrow || y >= ncol)
            return false;

        if (mat[x, y] != pat[level])
            return false;

        // Mark the cell as visited so that it cannot be reused on this path.
        char temp = mat[x, y];
        mat[x, y] = '#';

        // || stops exploring as soon as one direction succeeds.
        bool res = findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
                || findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
                || findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
                || findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

        // Un-mark the cell again.
        mat[x, y] = temp;
        return res;
    }

    /// <summary>
    /// Returns true when <paramref name="pat"/> occurs somewhere in the grid.
    /// The empty word is considered to be present.
    /// </summary>
    static bool checkMatch(char[,] mat, String pat, int nrow, int ncol)
    {
        int l = pat.Length;

        // Nothing to match; also avoids pat[0] on an empty string.
        if (l == 0)
            return true;

        // A word longer than the number of cells cannot fit.
        if (l > nrow * ncol)
            return false;

        // Try every cell that holds the first letter as a starting point.
        for (int i = 0; i < nrow; i++)
        {
            for (int j = 0; j < ncol; j++)
            {
                if (mat[i, j] == pat[0] && findmatch(mat, pat, i, j, nrow, ncol, 0))
                    return true;
            }
        }
        return false;
    }

    /// <summary>Demo: look for "geeks" in a 4x4 grid and print Yes or No.</summary>
    public static void Main(String[] args)
    {
        char[,] grid = { { 'a', 'x', 'm', 'y' },
                         { 'b', 'g', 'd', 'f' },
                         { 'x', 'e', 'e', 't' },
                         { 'r', 'a', 'k', 's' } };

        if (checkMatch(grid, "geeks", r, c))
            Console.Write("Yes");
        else
            Console.Write("No");
    }
}
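Time Complexity: O(r*c*3^L) in the worst case, where r and c are the rows and columns of the grid and L is the length of the word: the search may start from each of the r*c cells, and from every cell on the path it branches into at most 3 not-yet-visited neighbours (4 for the first cell).
Auxiliary Space: O(L) for the recursion stack. Visited cells are marked in place in the grid, so no extra matrix is needed.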
Approach when a group of words are to be checked : The idea used here is described in the steps below:
- iterate through group of words and check every cell, if the cell has the first character, then recur one by one and try all 4 directions from that cell for a match.
- Mark the position in the grid as visited and recur in the 4 possible directions.
- After recurring, again mark the position as unvisited.
- Once all the letters in the word are matched, return true, put it in answer list.
- return the answer list from a function and display
C++
#include <bits/stdc++.h>
using
namespace
std;
// Finds which words of a list can be spelled in a character board by moving
// to horizontally or vertically adjacent cells without using a cell twice.
class Solution {
public:
    // The four moves: down, right, up, left.
    std::vector<std::vector<int> > mover
        = { { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, -1 } };
    // Copy of the board passed to the last findWords call.
    std::vector<std::vector<char> > board;

    // Returns true when the letters of s can be spelled on the neighbours of
    // cell (x, y), which itself is treated as already used.
    //   vis  cells that must not be used; taken by value, so the caller's
    //        matrix is never modified
    bool dfs(int x, int y, std::string& s,
             std::vector<std::vector<bool> > vis)
    {
        return search(x, y, s, 0, vis);
    }

    // Returns the words that occur on the board, in input order. Every entry
    // of words is checked on its own, so a word listed twice is reported twice.
    // Empty words are skipped.
    std::vector<std::string> findWords(std::vector<std::vector<char> >& board,
                                       std::vector<std::string>& words)
    {
        this->board = board;
        std::vector<std::string> ans;

        // One visited matrix shaped like the board; search() always leaves it
        // cleared, so it can be shared by all words.
        std::vector<std::vector<bool> > vis(board.size());
        for (std::size_t i = 0; i < board.size(); i++)
            vis[i].assign(board[i].size(), false);

        for (auto& word : words) {
            if (!word.empty() && exists(word, vis))
                ans.push_back(word);
        }
        return ans;
    }

private:
    // True when word can be spelled starting from some cell of the board.
    bool exists(const std::string& word,
                std::vector<std::vector<bool> >& vis)
    {
        for (std::size_t i = 0; i < board.size(); i++) {
            for (std::size_t j = 0; j < board[i].size(); j++) {
                if (board[i][j] == word[0]
                    && search(static_cast<int>(i), static_cast<int>(j), word, 1, vis))
                    return true;
            }
        }
        return false;
    }

    // Backtracking step: (x, y) holds s[pos - 1]; tries to place s[pos..] on
    // unvisited neighbours. vis is shared by reference and restored on return,
    // which avoids copying the whole matrix on every call.
    bool search(int x, int y, const std::string& s, std::size_t pos,
                std::vector<std::vector<bool> >& vis)
    {
        if (pos == s.size())
            return true;

        vis[x][y] = true;
        bool sol = false;
        for (std::size_t i = 0; i < mover.size() && !sol; i++) {
            int curr_x = mover[i][0] + x;
            int curr_y = mover[i][1] + y;
            if (curr_x < 0 || curr_x >= static_cast<int>(board.size()))
                continue;
            if (curr_y < 0 || curr_y >= static_cast<int>(board[curr_x].size())
                || curr_y >= static_cast<int>(vis[curr_x].size()))
                continue;
            if (board[curr_x][curr_y] == s[pos] && !vis[curr_x][curr_y])
                sol = search(curr_x, curr_y, s, pos + 1, vis);
        }
        vis[x][y] = false;
        return sol;
    }
};
// Demo: print the words of a small list that occur on a 4x4 board.
int main()
{
    Solution solver;
    std::vector<std::vector<char> > board
        = { { 'o', 'a', 'a', 'n' },
            { 'e', 't', 'a', 'e' },
            { 'i', 'h', 'k', 'r' },
            { 'i', 'f', 'l', 'v' } };
    std::vector<std::string> words = { "oath", "pea", "eat", "rain" };

    std::vector<std::string> ans = solver.findWords(board, words);

    // "\n" ends the heading line (a bare n would be printed literally).
    std::cout << "words present:\n";
    for (auto& part : ans)
        std::cout << part << std::endl;
    return 0;
}
Java
import
java.util.*;
class
Solution {
int
[][] mover = {{
1
,
0
}, {
0
,
1
}, {-
1
,
0
}, {
0
, -
1
}};
char
[][] board;
public
boolean
dfs(
int
x,
int
y, String s,
boolean
[][] vis) {
if
(s.length() ==
0
) {
return
true
;
}
vis[x][y] =
true
;
boolean
sol =
false
;
for
(
int
i =
0
; i < mover.length; i++) {
int
curr_x = mover[i][
0
] + x;
int
curr_y = mover[i][
1
] + y;
if
(curr_x >=
0
&& curr_x < board.length) {
if
(curr_y >=
0
&& curr_y < board[
0
].length) {
if
(board[curr_x][curr_y] == s.charAt(
0
) && !vis[curr_x][curr_y]) {
String k = s.substring(
1
);
sol |= dfs(curr_x, curr_y, k, vis);
}
}
}
}
return
sol;
}
public
List<String> findWords(
char
[][] board, String[] words) {
this
.board = board;
List<String> ans =
new
ArrayList<>();
for
(String word : words) {
boolean
[][] vis =
new
boolean
[board.length][board[
0
].length];
for
(
int
i =
0
; i < board.length; i++) {
for
(
int
j =
0
; j < board[i].length; j++) {
if
(board[i][j] == word.charAt(
0
)) {
String s = word.substring(
1
);
if
(dfs(i, j, s, vis)) {
ans.add(word);
break
;
}
}
}
if
(!ans.isEmpty() && ans.get(ans.size() -
1
).equals(word)) {
break
;
}
}
}
return
ans;
}
}
/** Demo: prints the words of a small list that occur on a 4x4 board. */
class Main {
    public static void main(String[] args) {
        char[][] board = {
            "oaan".toCharArray(),
            "etae".toCharArray(),
            "ihkr".toCharArray(),
            "iflv".toCharArray()
        };
        String[] words = { "oath", "pea", "eat", "rain" };
        Solution solver = new Solution();

        System.out.println("Words present: ");
        List<String> arr = solver.findWords(board, words);
        for (String word : arr) {
            System.out.println(word);
        }
    }
}
Python3
class Solution:
    """Find which words of a list can be spelled on a character board.

    A word is spelled by moving to horizontally or vertically adjacent cells
    without using a cell twice.
    """

    # The four moves: down, right, up, left.
    mover = [[1, 0], [0, 1], [-1, 0], [0, -1]]
    # Board passed to the last findWords call.
    board = []

    def dfs(self, x, y, s, vis):
        """Return True when the letters of s fit on the neighbours of (x, y).

        x, y -- current cell; treated as used for the duration of the call
        s    -- letters that still have to be matched
        vis  -- matrix of booleans marking cells that must not be used; the
                current cell is marked while searching and un-marked again
                before returning, so a failed path never blocks a later one
        """
        if len(s) == 0:
            return True

        vis[x][y] = True
        try:
            for dx, dy in self.mover:
                curr_x = dx + x
                curr_y = dy + y
                if curr_x < 0 or curr_x >= len(self.board):
                    continue
                if curr_y < 0 or curr_y >= len(self.board[curr_x]):
                    continue
                if self.board[curr_x][curr_y] == s[0] and not vis[curr_x][curr_y]:
                    if self.dfs(curr_x, curr_y, s[1:], vis):
                        return True
            return False
        finally:
            # Backtrack: release the cell for other paths.
            vis[x][y] = False

    def findWords(self, board, words):
        """Return the words that occur on the board, in input order.

        Every entry of words is checked on its own, so a word listed twice is
        reported twice. Empty words are skipped.
        """
        self.board = board
        # One visited matrix shaped like the board; dfs always leaves it
        # cleared, so it can be shared by all words.
        vis = [[False] * len(row) for row in board]
        return [word for word in words if word and self._exists(word, vis)]

    def _exists(self, word, vis):
        """Return True when word can be spelled starting from some cell."""
        for i in range(len(self.board)):
            for j in range(len(self.board[i])):
                if self.board[i][j] == word[0] and self.dfs(i, j, word[1:], vis):
                    return True
        return False
# Demo: print the words of a small list that occur on a 4x4 board.
solver = Solution()
board = [['o', 'a', 'a', 'n'],
         ['e', 't', 'a', 'e'],
         ['i', 'h', 'k', 'r'],
         ['i', 'f', 'l', 'v']]
words = ["oath", "pea", "eat", "rain"]

print("Words present: ")
arr = solver.findWords(board, words)
for i in range(len(arr)):
    print(arr[i])
Javascript
/**
 * Finds which words of a list can be spelled on a character board by moving to
 * horizontally or vertically adjacent cells without using a cell twice.
 */
class Solution {
    // The four moves: down, right, up, left.
    mover = [ [1, 0], [0, 1], [-1, 0], [0, -1] ];
    // Board passed to the last findWords call.
    board = [];

    /**
     * Returns true when the letters of s fit on the neighbours of cell (x, y),
     * which itself is treated as used for the duration of the call.
     *
     * @param {number}      x   row of the current cell
     * @param {number}      y   column of the current cell
     * @param {string}      s   letters that still have to be matched
     * @param {boolean[][]} vis cells that must not be used; the current cell is
     *                          marked while searching and un-marked again
     *                          before returning
     * @returns {boolean}
     */
    dfs(x, y, s, vis) {
        if (s.length === 0) {
            return true;
        }

        vis[x][y] = true;
        let sol = false;
        for (let i = 0; i < this.mover.length && !sol; i++) {
            const curr_x = this.mover[i][0] + x;
            const curr_y = this.mover[i][1] + y;
            if (curr_x < 0 || curr_x >= this.board.length) {
                continue;
            }
            if (curr_y < 0 || curr_y >= this.board[curr_x].length) {
                continue;
            }
            if (this.board[curr_x][curr_y] === s[0] && !vis[curr_x][curr_y]) {
                sol = this.dfs(curr_x, curr_y, s.substring(1), vis);
            }
        }
        // Backtrack: release the cell for other paths.
        vis[x][y] = false;
        return sol;
    }

    /**
     * Returns the words that occur on the board, in input order. Every entry
     * of words is checked on its own, so a word listed twice is reported twice.
     * Empty words are skipped.
     */
    findWords(board, words) {
        this.board = board;
        // One visited matrix shaped like the board; dfs always leaves it
        // cleared, so it can be shared by all words.
        const vis = Array.from(board, row => new Array(row.length).fill(false));
        const ans = [];
        for (const word of words) {
            if (word.length > 0 && this.exists(word, vis)) {
                ans.push(word);
            }
        }
        return ans;
    }

    /** True when word can be spelled starting from some cell of the board. */
    exists(word, vis) {
        const rest = word.substring(1);
        for (let i = 0; i < this.board.length; i++) {
            for (let j = 0; j < this.board[i].length; j++) {
                if (this.board[i][j] === word[0] && this.dfs(i, j, rest, vis)) {
                    return true;
                }
            }
        }
        return false;
    }
}
// Demo: log the words of a small list that occur on a 4x4 board.
let solver = new Solution();
let board = [
    ['o', 'a', 'a', 'n'],
    ['e', 't', 'a', 'e'],
    ['i', 'h', 'k', 'r'],
    ['i', 'f', 'l', 'v']
];
let words = ["oath", "pea", "eat", "rain"];

const foundWords = solver.findWords(board, words);
console.log("Words present:");
foundWords.forEach(word => console.log(word));
C#
using
System;
using
System.Collections.Generic;
using
System.Linq;
/// <summary>
/// Finds which words of a list can be spelled on a character board by moving to
/// horizontally or vertically adjacent cells without using a cell twice.
/// </summary>
class Solution {
    // The four moves: down, right, up, left.
    private static readonly int[][] Moves = {
        new[] {1, 0},
        new[] {0, 1},
        new[] {-1, 0},
        new[] {0, -1}
    };

    // Board of the current FindWords call.
    private char[][] _board;

    /// <summary>
    /// Returns the words that occur on the board, in input order.
    /// </summary>
    /// <param name="board">Rows of characters to search.</param>
    /// <param name="words">Candidate words.</param>
    public IList<string> FindWords(char[][] board, string[] words) {
        _board = board;
        List<string> matches = new List<string>();
        for (int n = 0; n < words.Length; n++) {
            string candidate = words[n];
            if (!Exists(candidate)) {
                continue;
            }
            matches.Add(candidate);
        }
        return matches;
    }

    /// <summary>
    /// True when <paramref name="word"/> can be spelled starting from some cell.
    /// </summary>
    private bool Exists(string word) {
        for (int row = 0; row < _board.Length; row++) {
            for (int col = 0; col < _board[0].Length; col++) {
                if (_board[row][col] != word[0]) {
                    continue;
                }
                if (Search(word, row, col, 0)) {
                    return true;
                }
            }
        }
        return false;
    }

    /// <summary>
    /// Backtracking step: true when the letters of <paramref name="word"/> from
    /// index <paramref name="k"/> onwards can be spelled starting at cell (i, j).
    /// The cell is overwritten with '#' while its neighbours are explored and
    /// restored afterwards.
    /// </summary>
    private bool Search(string word, int i, int j, int k) {
        // Every letter has been matched.
        if (k == word.Length) {
            return true;
        }

        bool outside = i < 0 || j < 0 || i == _board.Length || j == _board[0].Length;
        if (outside || _board[i][j] != word[k]) {
            return false;
        }

        char saved = _board[i][j];
        _board[i][j] = '#';

        // Any() stops at the first direction that succeeds.
        bool found = Moves.Any(step => Search(word, i + step[0], j + step[1], k + 1));

        _board[i][j] = saved;
        return found;
    }
}
/// <summary>Demo: prints the words of a small list that occur on a 4x4 board.</summary>
public class Program {
    public static void Main() {
        char[][] board = {
            "oaan".ToCharArray(),
            "etae".ToCharArray(),
            "ihkr".ToCharArray(),
            "iflv".ToCharArray()
        };
        string[] words = { "oath", "pea", "eat", "rain" };

        Solution solver = new Solution();
        IList<string> ans = solver.FindWords(board, words);

        Console.WriteLine("Words present:");
        for (int n = 0; n < ans.Count; n++) {
            Console.WriteLine(ans[n]);
        }
    }
}
Output
words present: oath eat
Time Complexity: O(W*r*c*3^L) in the worst case, where W is the number of words, L is the length of the longest word, and r and c are the rows and columns of the grid: each word is searched from every cell with a backtracking search that branches into at most 3 not-yet-visited neighbours per step.
Auxiliary Space: O(r*c + L): the visited matrix (or the copy of the board) plus the recursion stack, whose depth is at most L.
Word breakers split longer words (e. g. compound nouns or compounds) into meaningful constituents. They are used with search engines that do not support infix search. While there aren’t many compounds in spoken English the problem is more prominent in languages like German or Swedish. Take for example the notorious “Donaudampfschiffahrtsgesellschaftskapitänsmütze”: A word that is made up from at least seven different words, depending on how you count. The problem is how to split possible compounds into their constituents in an efficient way.
This problem arose from a specific limitation of the Microsoft SQL Server 2008 we were using for searching sales item texts in a project. Its full text search engine doesn’t support infix search, only prefix search. This means that, for example, the word “Wildrahmsauce” (venison cream sauce) would not be matched by the query words “Rahm” or “Sauce”. However, this was what our users wanted.
Although SQL Server 2008 includes a word breaker for the German language it is not dictionary-based but rule-based. As such it fails to recognize the components of “Wildrahmsauce”. Moreover we would like to store item texts in three languages (German, French, Italian) in the same table so we would have to make a distinction on a per-record basis, something a SQL server word breaker can’t do. Unfortunately, they can produce other strange side effects as well such as not matching certain prefixes (!), so we had to disable language features entirely. You can apparently implement a DLL and register it as a word breaker with SQL Server, however, we felt that this approach would be rather time consuming and perhaps not robust enough.
A SQL query using LIKE on our approximately 50’000 records proved to be too slow. Holding the complete article list in an indexed structure in RAM or using a separate search component was also not an option. To be able to use SQL Server’s fulltext engine, we needed a way to enrich our search terms with the missing keywords. This meant that we had to implement a word breaker to recognize the words “Rahm” and “Sauce” within “Wildrahmsauce”. Note that the word “Wild” is not required to be a separately indexed word because SQL Server can do prefix matching already.
The first step is to decide on a useful dictionary of possible constituents. Obviously the size of the dictionary affects the performance of the splitting algorithm. It is therefore good practice to use a dictionary of minimal size. In our case we are using the words in the article texts themselves. This is based on the observation that the words “Rahm” and “Sauce” occur as separate words in other article texts, and the same is true for many other words that also appear as constituents of compounds. There are very few cases of constituents that appear only in compounds; these obviously cannot be detected automatically. We add them manually when our users require it.
It is also important that the minimum length of the dictionary words should not be too small. In this case we decided on a minimum length of 3 characters. A smaller limit would lead to many insignificant additional parts while providing little benefit for actual searches that typically contain three characters or more per search keyword.
The word breaker algorithm itself consists of two steps. First, an index is built from the dictionary words. Second, the words that require splitting are matched against this index to determine their constituents.
The implementation of the word breaker I present here is quite naïve. It doesn’t claim to be particularly efficient in respect to both time and space. However, it is easy to understand, quick to implement and fast enough for our purposes.
All search terms and dictionary words are normalized before using them, i. e. we
- replace non-alphanumeric characters by blanks,
- replace sequences of whitespace characters with one blank,
- replace accented characters and substitute umlauts (e. g. ‘ä’) with their non-umlaut equivalents (e. g. ‘ae’),
- trim the remaining string and
- capitalize it.
An example normalization function is shown here. The same function should be used to normalize query input as well before performing the actual search.
/// <summary>
/// Normalize string (remove special chars, replace accented characters,
/// substitute/expand umlauts, remove whitespace)
/// </summary>
/// <param name="input">Text to normalize; null is treated as empty.</param>
/// <returns>The upper-cased, trimmed text with special characters replaced by
/// single blanks and accented characters / umlauts replaced or expanded.</returns>
public static string NormalizeString(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    // Replace special chars with blanks. The text is upper-cased first (culture
    // independent), so the character class only needs upper-case letters plus
    // 'ß', which has no single upper-case form.
    var r1 = Regex.Replace(input.ToUpperInvariant().Trim(), @"[^A-Z0-9ÁÀÂÄÉÈÊËÍÌÎÏÓÒÔÖÚÙÛÜÇß]", @" ");

    // Replace multiple whitespace chars with one
    var result = new StringBuilder(Regex.Replace(r1, @"\s{2,}", @" "));

    // Replace or expand accented characters and umlauts
    result.Replace('Á', 'A');
    result.Replace('À', 'A');
    result.Replace('Â', 'A');
    result.Replace(@"Ä", @"AE");
    result.Replace('É', 'E');
    result.Replace('È', 'E');
    result.Replace('Ê', 'E');
    result.Replace('Ë', 'E');
    result.Replace('Í', 'I');
    result.Replace('Ì', 'I');
    result.Replace('Î', 'I');
    result.Replace('Ï', 'I');
    result.Replace('Ó', 'O');
    result.Replace('Ò', 'O');
    result.Replace('Ô', 'O');
    result.Replace(@"Ö", @"OE");
    result.Replace('Ú', 'U');
    result.Replace('Ù', 'U');
    result.Replace('Û', 'U');
    result.Replace(@"Ü", @"UE");
    // 'ç' has already been upper-cased to 'Ç' at this point.
    result.Replace('Ç', 'C');
    result.Replace(@"ß", @"SS");

    return result.ToString().Trim();
}
A very simple (and inefficient) solution of the word breaker problem in pseudo code is:
for each compound { for each possible_word { if (compound.Contains(possible_word)) { results.Add(possible_word) } } }
It’s easy to see that the worst case runtime is O(n * m) where n is the number of compounds and m is the number of possible words. This is far too inefficient for practical purposes, as m almost equals n in our case (remember, we are taking the words from the article texts as possible constituents). The fairly naïve approach we are taking here is to reduce m; while this doesn’t reduce the worst case time complexity of the algorithm we can gain a great speed improvement in practice.
Thus, a refined solution might look like this:
for each compound { for each character except the first one { get possible_words starting with this character for each possible_word { if (compound contains the word at the character’s position) { results.Add(possible_word) } } } }
What we do here is reduce the list of possible words by testing only against those words that start with the character we are currently looking at. Getting the list using a hash table can be done in O(1) assuming the hash has been built beforehand. Building the hash is done in linear time, i.e. O(m).
Note that the introduction of the character loop doesn’t introduce an additional factor into the time complexity formula. It just makes the Contains operation of the simple solution explicit, and we can assume that string length is constant for the purposes of this comparison (string length is limited to 255 characters by definition).
However, the speed gain is not yet big enough. One improvement we can still make is to take only those possible words into account that are equal in length or shorter than the rest of the remaining possible compound. Another improvement is to regard two leading characters instead of one. These two conditions greatly reduce the number of words we have to try against the compound.
The resulting data structure consists of word lists indexed by two properties: first, the length of the words, and second, the first two characters of the words. To improve speed the indexes are implemented as dictionaries. In C#, the type is essentially
Dictionary<int, Dictionary<string, List<string>>>
where the int is the word length, the string is the first two characters and the lists contain the suitable words. The Dictionary<string, List<string>> part is implemented as its own class TwoCharIndex, mainly to improve readability.
Here is the code for the word breaker. You can freely use it in commercial or non-commercial projects and modify it according to your needs.
using System;
using System.Collections.Generic;

namespace StringTools
{
    /// <summary>
    /// Index strings by their first two characters.
    /// </summary>
    class TwoCharIndex : Dictionary<string, List<string>>
    {
        /// <summary>
        /// Inserts the word into the index. Assumes it has been case normalized.
        /// </summary>
        /// <param name="word">Word of at least two characters.</param>
        /// <exception cref="ArgumentException">The word is shorter than two characters.</exception>
        public void Insert(string word)
        {
            if (word.Length < 2)
                throw new ArgumentException(@"word length must be at least 2");

            var prefix = word.Substring(0, 2);
            List<string> wordList;
            if (!TryGetValue(prefix, out wordList))
            {
                wordList = new List<string>();
                this[prefix] = wordList;
            }
            wordList.Add(word);
        }

        /// <summary>
        /// Returns true when the word has been inserted into the index.
        /// </summary>
        /// <param name="word">Word of at least two characters.</param>
        /// <exception cref="ArgumentException">The word is shorter than two characters.</exception>
        public bool Find(string word)
        {
            if (word.Length < 2)
                throw new ArgumentException(@"word length must be at least 2");

            List<string> wordList;
            if (TryGetValue(word.Substring(0, 2), out wordList))
            {
                return wordList.Contains(word);
            }
            return false;
        }
    }

    /// <summary>
    /// Builds an index based on a given dictionary and provides a Break method to split longer words
    /// into constituents from this dictionary.
    /// </summary>
    public class WordBreakerIndex
    {
        // One two-character index per word length.
        readonly Dictionary<int, TwoCharIndex> wordIndexes = new Dictionary<int, TwoCharIndex>();

        private const int MinLength = 3;

        // The two-character index cannot hold words shorter than this.
        private const int SmallestIndexableLength = 2;

        private readonly int minLength;

        /// <summary>
        /// Create and build the word breaker index based on the dictionary of given words.
        /// </summary>
        /// <param name="words">The dictionary of words that are used for breaking</param>
        /// <param name="aMinLength">The minimum length of words that are used (default = 3).
        /// Values below 2 are treated as 2 because words are indexed by their first two characters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public WordBreakerIndex(IEnumerable<string> words, int aMinLength = MinLength)
        {
            if (words == null)
                throw new ArgumentNullException("words");

            minLength = Math.Max(aMinLength, SmallestIndexableLength);

            // iterate over words
            foreach (var word in words)
            {
                // take only longer words (null entries are ignored)
                if (word == null || word.Length < minLength)
                    continue;

                // get or create index
                TwoCharIndex wordIndex;
                if (!wordIndexes.TryGetValue(word.Length, out wordIndex))
                {
                    // create a new index for words of this length
                    wordIndex = new TwoCharIndex();
                    wordIndexes[word.Length] = wordIndex;
                }

                // insert the word into the index
                wordIndex.Insert(word);
            }
        }

        /// <summary>
        /// Attempt to split the word into smaller ones that the word does not begin with (infix or suffix words).
        /// </summary>
        /// <param name="word">The (normalized) word to split; null or empty yields no parts.</param>
        /// <returns>The distinct dictionary words found at any position after the first character.</returns>
        public IEnumerable<string> Break(string word)
        {
            var result = new HashSet<string>();
            if (string.IsNullOrEmpty(word))
                return result;

            // Iterate over the word's characters starting from the second one, because
            // the first character (or word component) would be matched by prefix search anyway.
            for (var i = 1; word.Length - i >= minLength; i++)
            {
                var sufLength = word.Length - i;

                // try to match the remaining lengths
                for (var l = sufLength; l >= minLength; l--)
                {
                    // get index for this length; skip lengths without any dictionary word
                    TwoCharIndex wordIndex;
                    if (!wordIndexes.TryGetValue(l, out wordIndex))
                        continue;

                    var wordPart = word.Substring(i, l);
                    if (wordIndex.Find(wordPart))
                        result.Add(wordPart);
                }
            }
            return result;
        }

        /// <summary>
        /// Returns the number of words stored in the index (a word passed in twice is counted twice).
        /// </summary>
        public int GetWordCount()
        {
            var result = 0;
            foreach (var tci in wordIndexes.Values)
                foreach (var twi in tci.Values)
                    result += twi.Count;
            return result;
        }
    }
}
And here’s some statistics for our input data (in three languages):
- Total records: 47999
- Total characters: 3’767’975
- Total words (non-distinct, includes one- and two-letter words): 614081
- Characters per record: min 25, avg 78.5, max 118
- Words per record: min 2, avg 12.8, max 24
- Number of unique words in dictionary index (word length > 2): 95210
- Time to build dictionary indexes and perform word breaking for all records: ~ 8 seconds
- Total number of word constituents added to all records: 400646 (about 8.3 additional keywords per record)
Times are as measured on my development machine which is a reasonably fast Intel Core 2 Quad with 2.66 GHz and 4 GB RAM running Windows 7 32 bit. Loading the records from the database takes much longer than the actual word breaking.
While I don’t claim that this is the fastest you can ever do it is definitely fast enough for us.