Foreach var word in words - Word и Excel - помощь в работе с программами

I wrote a string processor class. You can use it.

Example:

metaKeywords = bodyText.Process(prepositions).OrderByDescending().TakeTop().GetWords().AsString();

Class:

 /// <summary>
 /// Extension methods that turn free text into a ranked keyword list: count word
 /// frequencies, order them, keep the top entries and render them as a string.
 /// </summary>
 public static class StringProcessor
{
    // Punctuation, markup and line-break fragments stripped from every token before it is counted.
    private static readonly string[] NoiseFragments =
    {
        ".", "(", ")", "?", "!", ",", "<br>", ":", ";", "،", "-", "\r", "\n"
    };

    /// <summary>
    /// Replaces the characters U+06A9 and U+06CC with U+0643 and U+064A respectively,
    /// so that both spellings of Kaf and Yeh compare as equal.
    /// </summary>
    /// <param name="strText">Text to normalize; may be null.</param>
    /// <returns>The normalized text, or an empty string for null/empty input.</returns>
    public static string ToNormalString(this string strText)
    {
        if (String.IsNullOrEmpty(strText)) return String.Empty;
        return strText.Replace('\u06A9', '\u0643').Replace('\u06CC', '\u064A');
    }

    /// <summary>
    /// Splits the text on <paramref name="splitor"/>, strips punctuation from every token
    /// and counts how often each remaining word occurs.
    /// </summary>
    /// <param name="bodyText">Text to analyse; null is treated as empty.</param>
    /// <param name="blackListWords">Words that must not be counted; may be null.</param>
    /// <param name="minimumWordLength">Only words strictly longer than this are counted.</param>
    /// <param name="splitor">Character that separates tokens.</param>
    /// <param name="perWordIsLowerCase">When true, tokens are lower-cased before counting.</param>
    /// <returns>One (word, occurrences) pair per distinct word.</returns>
    public static List<KeyValuePair<String, Int32>> Process(this String bodyText,
        List<String> blackListWords = null,
        int minimumWordLength = 3,
        char splitor = ' ',
        bool perWordIsLowerCase = true)
    {
        var wordsDic = new Dictionary<String, Int32>();
        foreach (string word in bodyText.ToNormalString().Split(splitor))
        {
            string normalWord = perWordIsLowerCase ? word.ToLower() : word;
            foreach (string fragment in NoiseFragments)
            {
                normalWord = normalWord.Replace(fragment, "");
            }
            normalWord = normalWord.Trim();

            if (normalWord.Length <= minimumWordLength || normalWord.IsMemberOfBlackListWords(blackListWords))
            {
                continue;
            }

            // TryGetValue leaves count at 0 for a word seen for the first time.
            int count;
            wordsDic.TryGetValue(normalWord, out count);
            wordsDic[normalWord] = count + 1;
        }
        return wordsDic.ToList();
    }

    /// <summary>
    /// Sorts the pairs in descending order, by frequency (default) or by word.
    /// </summary>
    public static List<KeyValuePair<String, Int32>> OrderByDescending(this List<KeyValuePair<String, Int32>> list, bool isBasedOnFrequency = true)
    {
        return isBasedOnFrequency
            ? list.OrderByDescending(q => q.Value).ToList()
            : list.OrderByDescending(q => q.Key).ToList();
    }

    /// <summary>Returns at most the first <paramref name="n"/> pairs.</summary>
    public static List<KeyValuePair<String, Int32>> TakeTop(this List<KeyValuePair<String, Int32>> list, Int32 n = 10)
    {
        return list.Take(n).ToList();
    }

    /// <summary>Projects the pairs onto their words, keeping the order.</summary>
    public static List<String> GetWords(this List<KeyValuePair<String, Int32>> list)
    {
        return list.Select(item => item.Key).ToList();
    }

    /// <summary>Projects the pairs onto their occurrence counts, keeping the order.</summary>
    public static List<Int32> GetFrequency(this List<KeyValuePair<String, Int32>> list)
    {
        return list.Select(item => item.Value).ToList();
    }

    /// <summary>
    /// Joins the items with <paramref name="seprator"/> between them
    /// (no separator after the last item).
    /// </summary>
    public static String AsString<T>(this List<T> list, string seprator = ", ")
    {
        return string.Join(seprator, list);
    }

    /// <summary>
    /// Checks whether <paramref name="word"/> equals one of the black-listed words
    /// after those have been passed through <see cref="ToNormalString"/>.
    /// The comparison is case-sensitive.
    /// </summary>
    private static bool IsMemberOfBlackListWords(this String word, List<String> blackListWords)
    {
        if (blackListWords == null) return false;
        return blackListWords.Any(w => w.ToNormalString().Equals(word));
    }
}

Источник

Krya_Krya

0 / 0 / 0

Регистрация: 14.10.2018

Сообщений: 23

20.12.2019, 21:33. Показов 5129. Ответов 3

Метки foreach, новичок, сишарп (Все метки)

Подскажите пожалуйста, как с помощью foreach разбить строку на подстроки? Знаю как со сплитом это сделать, но хочется узнать все варианты. Код со сплитом:

// Split the name on spaces and print the first two parts on separate lines.
string[] words = name.Split(' ');
Console.Write("{0}\n{1}!", words[0], words[1]);

Diamante

3453 / 2461 / 1169

Регистрация: 14.08.2016

Сообщений: 8,153

20.12.2019, 22:10

Krya_Krya, ну примерно так

            var str = "Подскажите пожалуйста, как с помощью foreach разбить строку на подстроки? Знаю как со сплитом это сделать, но хочется узнать все варианты. Код со сплитом:";
            var separators = new char[] { ' ', ',', '.', ':', '?' };
            var words = new List<string>();
            var sb = new StringBuilder();
            foreach (var ch in str)
            {
                if (separators.Contains(ch))
                {
                    if (sb.Length != 0) words.Add(sb.ToString());
                    else continue;
                    sb.Clear();
                    continue;
                }
                else sb.Append(ch);
            }
            foreach (var word in words)
            {
                Console.WriteLine(word);
            }

только зачем?

Enifan

1840 / 1182 / 501

Регистрация: 14.10.2018

Сообщений: 3,179

20.12.2019, 22:15

Krya_Krya, в данному случаи я буду использовать динамический массив (список List<>) а не статический массив (обычный string[]), так как заранее кол-во слов не известно. Конечно можно сделать подсчет и создать статический массив нужного размера — но на это надо дополнительный цикл делать.
Если вы хотите знать базовые алгоритмы — то вам надо изучать Си или C++, там этого добра полно
И кстати в вашем случаи сплит сделает на одну строку больше чем надо (последняя будет пустая), на Метаните написано как этого избежать

using System;
using System.Collections.Generic;
 
/// <summary>
/// Demo: splits a sentence into words with a character-by-character loop
/// and prints one word per line.
/// </summary>
class Program
{
    static void Main()
    {
        string str = "Разбить строку на подстроки по разделителю с использованием foreach";
        var words = new List<string>();
        string current = string.Empty;

        // Walk the text one character at a time.
        for (int i = 0; i < str.Length; i++)
        {
            char ch = str[i];

            if (!Char.IsLetter(ch))
            {
                // A non-letter ends the current word, provided it holds at least one letter.
                if (current.Length > 0)
                {
                    words.Add(current);
                    current = string.Empty; // start collecting the next word
                }
                continue;
            }

            // Letters of any alphabet extend the current word.
            current = current + ch;
        }

        // The last word has been collected but not stored yet if the text ends with a letter.
        if (current.Length > 0)
        {
            words.Add(current);
        }

        // Print every word.
        words.ForEach(w => Console.WriteLine(w));

        Console.ReadKey();
    }
}

350 / 245 / 76

Регистрация: 18.03.2016

Сообщений: 979

21.12.2019, 00:01

Enifan, …

Krya_Krya, попробуй через регулярные выражения. Например то что делает Enifan:
https://regex101.com/r/Aee6U8/1

Источник

Let’s set up code to benchmark different approaches. Every word counter will implement this interface:

/// <summary>
/// Common contract for the word-counting strategies being benchmarked.
/// </summary>
interface IWordCounter
{
    /// <summary>
    /// Counts the words of the file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the text file to read.</param>
    /// <returns>A dictionary mapping each word to its number of occurrences.</returns>
    IDictionary<string, int> CountWords(string path);
}

And here’s our benchmark runner:

// Implementations to compare; list the instances here.
var wordCounters = new IWordCounter[]
{
    // ...
};

for (var index = 0; index < wordCounters.Length; index++)
{
    IWordCounter counter = wordCounters[index];

    // Collect garbage left over from the previous run before the clock starts.
    GC.Collect();
    GC.WaitForPendingFinalizers();

    var timer = new Stopwatch();
    timer.Start();
    IDictionary<string, int> tally = counter.CountWords(path);
    timer.Stop();

    // Implementation name, number of distinct words, elapsed time.
    Console.WriteLine("{0}, {1} entries, {2}", counter.GetType().Name, tally.Count, timer.Elapsed);
}

Timings were taken with a release build, on the test file provided, no debugger attached, on .NET 4.5.2.

Here’s the original code:

/// <summary>
/// Baseline implementation: reads the file line by line through a StreamReader and
/// counts words with a ContainsKey check followed by an indexer read and write.
/// </summary>
class OriginalWordCounter : IWordCounter
{
    // Words are delimited by the space character only.
    private static readonly char[] separators = { ' ' };

    /// <summary>
    /// Counts the space-separated words of the file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>A dictionary mapping each word to its number of occurrences.</returns>
    public IDictionary<string, int> CountWords(string path)
    {
        var wordCount = new Dictionary<string, int>();

        using (var fileStream = File.Open(path, FileMode.Open, FileAccess.Read))
        using (var streamReader = new StreamReader(fileStream))
        {
            string line;
            // ReadLine returns null once the end of the stream is reached.
            while ((line = streamReader.ReadLine()) != null)
            {
                // RemoveEmptyEntries drops the empty tokens produced by consecutive spaces.
                var words = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);

                foreach (var word in words)
                {
                    // Up to three dictionary lookups per word: ContainsKey, indexer get, indexer set.
                    if (wordCount.ContainsKey(word))
                    {
                        wordCount[word] = wordCount[word] + 1;
                    }
                    else
                    {
                        wordCount.Add(word, 1);
                    }
                }
            }
        }

        return wordCount;
    }
}

On my machine, this takes about 8.2s.

We see an improvement using Heslacher’s suggestion to use TryGet:

/// <summary>
/// Variant of the baseline that reads lines with File.ReadLines and updates
/// each count with TryGetValue followed by a single indexer write.
/// </summary>
class OriginalTryGetWordCounter : IWordCounter
{
    // Words are delimited by the space character only.
    private static readonly char[] separators = { ' ' };

    /// <summary>
    /// Counts the space-separated words of the UTF-8 file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>A dictionary mapping each word to its number of occurrences.</returns>
    public IDictionary<string, int> CountWords(string path)
    {
        var wordCount = new Dictionary<string, int>();

        foreach (var line in File.ReadLines(path, Encoding.UTF8))
        {
            var words = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
            foreach (var word in words)
            {
                // TryGetValue sets count to 0 when the word is not present yet,
                // so the same assignment handles both new and known words.
                int count;
                wordCount.TryGetValue(word, out count);
                wordCount[word] = count + 1;
            }
        }

        return wordCount;
    }
}

This takes about 6.7s. (The use of File.ReadLines here doesn’t seem to affect the timing, it’s just a bit cleaner.)

We get another improvement with Parallel.ForEach together with a ConcurrentDictionary:

/// <summary>
/// Variant that processes the lines of the file with Parallel.ForEach and
/// accumulates the counts in a ConcurrentDictionary.
/// </summary>
class ParallelWordCounter : IWordCounter
{
    /// <summary>
    /// Counts the space-separated words of the UTF-8 file at <paramref name="path"/>.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>A dictionary mapping each word to its number of occurrences.</returns>
    public IDictionary<string, int> CountWords(string path)
    {
        var result = new ConcurrentDictionary<string, int>();
        Parallel.ForEach(File.ReadLines(path, Encoding.UTF8), line =>
        {
            var words = line.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            foreach (var word in words)
            {
                // Inserts 1 for a new word, otherwise replaces the stored count with count + 1.
                result.AddOrUpdate(word, 1, (_, x) => x + 1);
            }
        });

        return result;
    }
}

This takes about 5.2s.

You might want to try some of the Parallel.ForEach overloads to see if you can get any further improvements, and remember to take these results with a grain of salt.

Источник

Given a 2D grid of characters and a word/ multiple words, the task is to check if that word/words exist in the grid or not. A word can be matched in 4 directions at any point.
The 4 directions are Horizontally Left and Right, Vertically Up and Down.
Examples:

Input:  grid[][] = {"axmy",
                    "bgdf",
                    "xeet",
                    "raks"};
Output: Yes

a x m y
b g d f
x e e t
r a k s

Input: grid[][] = {"axmy",
                   "brdf",
                   "xeet",
                   "rass"};
Output : No

Source: Microsoft Interview

Approach when a single word is to be checked : The idea used here is described in the steps below:

Check every cell, if the cell has the first character, then recur one by one and try all 4 directions from that cell for a match.
Mark the position in the grid as visited and recur in the 4 possible directions.
After recurring, again mark the position as unvisited.
Once all the letters in the word are matched, return true.

Below is the implementation of the above approach:

C++

#include <bits/stdc++.h>

using namespace std;

// Grid dimensions. Every row is stored as a C string, so the column count
// includes the terminating '\0' (4 letters + 1).
#define r 4
#define c 5

// Tries to match pat[level..] starting at cell (x, y), moving up, down, left
// or right one cell at a time and never using a cell twice on the same path.
// Returns true when the rest of the pattern can be matched.
bool findmatch(char mat[r][c], string pat, int x, int y,
               int nrow, int ncol, int level)
{
    int l = pat.length();

    // Every character of the pattern has been matched.
    if (level == l)
        return true;

    // Outside the grid.
    if (x < 0 || y < 0 || x >= nrow || y >= ncol)
        return false;

    if (mat[x][y] != pat[level])
        return false;

    // Mark the cell as used on the current path.
    char temp = mat[x][y];
    mat[x][y] = '#';

    // Short-circuit: stop exploring as soon as one direction succeeds.
    bool res = findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
            || findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
            || findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
            || findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

    // Restore the cell so other paths may use it.
    mat[x][y] = temp;
    return res;
}

// Returns true when pat can be traced in the grid starting from any cell.
bool checkMatch(char mat[r][c], string pat, int nrow, int ncol)
{
    int l = pat.length();

    // A pattern longer than the number of cells can never fit.
    if (l > nrow * ncol)
        return false;

    for (int i = 0; i < nrow; i++) {
        for (int j = 0; j < ncol; j++) {
            if (mat[i][j] == pat[0])
                if (findmatch(mat, pat, i, j, nrow, ncol, 0))
                    return true;
        }
    }
    return false;
}

// Driver: reports whether "geeks" can be traced in the sample grid.
int main()
{
    // Each row is a C string: four letters plus the terminating '\0', hence c columns.
    char grid[r][c] = { "axmy",
                        "bgdf",
                        "xeet",
                        "raks" };

    if (checkMatch(grid, "geeks", r, c))
        cout << "Yes";
    else
        cout << "No";

    return 0;
}

Java

/**
 * Checks whether a word can be traced in a character grid by moving one cell
 * up, down, left or right at a time without reusing a cell.
 */
class GFG
{
    static final int r = 4;
    static final int c = 4;

    /**
     * Tries to match {@code pat} from index {@code level} onwards, starting at cell (x, y).
     * The cell is temporarily overwritten with '#' so the current path cannot reuse it.
     *
     * @return true when the rest of the pattern can be matched
     */
    static boolean findmatch(char mat[][], String pat, int x, int y,
                             int nrow, int ncol, int level)
    {
        int l = pat.length();

        // Every character of the pattern has been matched.
        if (level == l)
            return true;

        // Outside the grid.
        if (x < 0 || y < 0 || x >= nrow || y >= ncol)
            return false;

        if (mat[x][y] != pat.charAt(level))
            return false;

        // Mark the cell as used on the current path.
        char temp = mat[x][y];
        mat[x][y] = '#';

        // Short-circuit: stop exploring as soon as one direction succeeds.
        boolean res = findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
                   || findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
                   || findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
                   || findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

        // Restore the cell so other paths may use it.
        mat[x][y] = temp;
        return res;
    }

    /**
     * @return true when {@code pat} can be traced in the grid starting from any cell
     */
    static boolean checkMatch(char mat[][], String pat, int nrow, int ncol)
    {
        int l = pat.length();

        // A pattern longer than the number of cells can never fit.
        if (l > nrow * ncol)
            return false;

        for (int i = 0; i < nrow; i++)
        {
            for (int j = 0; j < ncol; j++)
            {
                if (mat[i][j] == pat.charAt(0))
                    if (findmatch(mat, pat, i, j, nrow, ncol, 0))
                        return true;
            }
        }
        return false;
    }

    public static void main(String[] args)
    {
        char grid[][] = { "axmy".toCharArray(),
                          "bgdf".toCharArray(),
                          "xeet".toCharArray(),
                          "raks".toCharArray() };

        if (checkMatch(grid, "geeks", r, c))
            System.out.print("Yes");
        else
            System.out.print("No");
    }
}

Python3

# Dimensions of the sample grid.
r = 4
c = 4


def _set_cell(mat, x, y, ch):
    """Write ch into cell (x, y); rows may be lists or (immutable) strings."""
    if isinstance(mat[x], str):
        # Strings cannot be modified in place, so rebuild the row.
        mat[x] = mat[x][:y] + ch + mat[x][y + 1:]
    else:
        mat[x][y] = ch


def findmatch(mat, pat, x, y, nrow, ncol, level):
    """Try to match pat[level:] starting at cell (x, y).

    Moves one cell up, down, left or right at a time. The current cell is
    temporarily overwritten with '#' so a path cannot use it twice.
    Returns True when the rest of the pattern can be matched.
    """
    # Every character of the pattern has been matched.
    if level == len(pat):
        return True

    # Outside the grid.
    if x < 0 or y < 0 or x >= nrow or y >= ncol:
        return False

    if mat[x][y] != pat[level]:
        return False

    # Mark the cell as used on the current path.
    temp = mat[x][y]
    _set_cell(mat, x, y, "#")

    # Short-circuit: stop exploring as soon as one direction succeeds.
    res = (findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
           or findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
           or findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
           or findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1))

    # Restore the cell so other paths may use it.
    _set_cell(mat, x, y, temp)
    return res


def checkMatch(mat, pat, nrow, ncol):
    """Return True when pat can be traced in the grid starting from any cell."""
    # A pattern longer than the number of cells can never fit.
    if len(pat) > nrow * ncol:
        return False

    # Work on a mutable copy so the caller's grid is never modified.
    cells = [list(row) for row in mat]

    for i in range(nrow):
        for j in range(ncol):
            if cells[i][j] == pat[0]:
                if findmatch(cells, pat, i, j, nrow, ncol, 0):
                    return True
    return False

# Driver: reports whether "geeks" can be traced in the sample grid.
if __name__ == "__main__":
    grid = ["axmy", "bgdf",
            "xeet", "raks"]

    if checkMatch(grid, "geeks", r, c):
        print("Yes")
    else:
        print("No")

Javascript

<script>

// Dimensions of the sample grid.
let r = 4;
let c = 4;

/**
 * Tries to match pat from index `level` onwards, starting at cell (x, y).
 * Moves one cell up, down, left or right at a time; the current cell is
 * temporarily overwritten with '#' so a path cannot use it twice.
 * @returns {boolean} true when the rest of the pattern can be matched
 */
function findmatch(mat, pat, x, y, nrow, ncol, level)
{
    let l = pat.length;

    // Every character of the pattern has been matched.
    if (level == l)
        return true;

    // Outside the grid.
    if (x < 0 || y < 0 || x >= nrow || y >= ncol)
        return false;

    if (mat[x][y] != pat[level])
        return false;

    // Mark the cell as used on the current path.
    let temp = mat[x][y];
    mat[x][y] = '#';

    // Logical OR keeps the result boolean and stops at the first success.
    let res =
        findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1) ||
        findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1) ||
        findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1) ||
        findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

    // Restore the cell so other paths may use it.
    mat[x][y] = temp;
    return res;
}

/**
 * @returns {boolean} true when pat can be traced in the grid starting from any cell
 */
function checkMatch(mat, pat, nrow, ncol)
{
    let l = pat.length;

    // A pattern longer than the number of cells can never fit.
    if (l > nrow * ncol)
        return false;

    for (let i = 0; i < nrow; i++)
    {
        for (let j = 0; j < ncol; j++)
        {
            if (mat[i][j] == pat[0])
                if (findmatch(mat, pat, i, j, nrow, ncol, 0))
                    return true;
        }
    }
    return false;
}

// Driver: build the sample grid row by row and report whether "geeks" can be traced in it.
let grid = [];
for (const row of ["axmy", "bgdf", "xeet", "raks"]) {
    grid.push(row.split(''));
}

document.write(checkMatch(grid, "geeks", r, c) ? "Yes" : "No");

</script>

C#

using System;

/// <summary>
/// Checks whether a word can be traced in a character grid by moving one cell
/// up, down, left or right at a time without reusing a cell.
/// </summary>
class GFG
{
    static readonly int r = 4;
    static readonly int c = 4;

    /// <summary>
    /// Tries to match <paramref name="pat"/> from index <paramref name="level"/> onwards,
    /// starting at cell (x, y). The cell is temporarily overwritten with '#' so the
    /// current path cannot reuse it.
    /// </summary>
    /// <returns>true when the rest of the pattern can be matched.</returns>
    static bool findmatch(char [,]mat, String pat, int x, int y,
                          int nrow, int ncol, int level)
    {
        int l = pat.Length;

        // Every character of the pattern has been matched.
        if (level == l)
            return true;

        // Outside the grid.
        if (x < 0 || y < 0 || x >= nrow || y >= ncol)
            return false;

        if (mat[x, y] != pat[level])
            return false;

        // Mark the cell as used on the current path.
        char temp = mat[x, y];
        mat[x, y] = '#';

        // Short-circuit: stop exploring as soon as one direction succeeds.
        bool res = findmatch(mat, pat, x - 1, y, nrow, ncol, level + 1)
                || findmatch(mat, pat, x + 1, y, nrow, ncol, level + 1)
                || findmatch(mat, pat, x, y - 1, nrow, ncol, level + 1)
                || findmatch(mat, pat, x, y + 1, nrow, ncol, level + 1);

        // Restore the cell so other paths may use it.
        mat[x, y] = temp;
        return res;
    }

    /// <summary>
    /// Returns true when <paramref name="pat"/> can be traced in the grid starting from any cell.
    /// </summary>
    static bool checkMatch(char [,]mat, String pat, int nrow, int ncol)
    {
        int l = pat.Length;

        // A pattern longer than the number of cells can never fit.
        if (l > nrow * ncol)
            return false;

        for (int i = 0; i < nrow; i++)
        {
            for (int j = 0; j < ncol; j++)
            {
                if (mat[i, j] == pat[0])
                    if (findmatch(mat, pat, i, j, nrow, ncol, 0))
                        return true;
            }
        }
        return false;
    }

    public static void Main(String[] args)
    {
        char [,]grid = { {'a','x','m','y'},
                         {'b','g','d','f'},
                         {'x','e','e','t'},
                         {'r','a','k','s'} };

        if (checkMatch(grid, "geeks", r, c))
            Console.Write("Yes");
        else
            Console.Write("No");
    }
}

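Time Complexity: O(r*c*3^L) in the worst case, where r and c are the rows and columns of the grid and L is the length of the word. Every cell can start a search, and each step of the backtracking search can branch into up to 3 not-yet-visited neighbours (4 at the first step).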

Auxiliary Space: O(r*c), as we are using extra space for the matrix. Where r and c are the rows and columns of the grid.

Approach when a group of words are to be checked : The idea used here is described in the steps below:

iterate through group of words and check every cell, if the cell has the first character, then recur one by one and try all 4 directions from that cell for a match.
Mark the position in the grid as visited and recur in the 4 possible directions.
After recurring, again mark the position as unvisited.
Once all the letters in the word are matched, return true, put it in answer list.
return the answer list from a function and display

C++

#include <bits/stdc++.h>

using namespace std;

// Finds which words of a list can be traced in a character board by moving
// one cell up, down, left or right at a time without reusing a cell.
class Solution {
public:
    // Row/column offsets of the four neighbours: down, right, up, left.
    vector<vector<int> > mover
        = { { 1, 0 }, { 0, 1 }, { -1, 0 }, { 0, -1 } };

    vector<vector<char> > board;

    // Cell (x, y) is assumed to be matched already; s holds the characters
    // that remain to be matched from its neighbours onwards.
    bool dfs(int x, int y, string& s,
             vector<vector<bool> > vis)
    {
        return search(x, y, s, 0, vis);
    }

    // Returns the words that occur on the board, in the order they were given.
    vector<string> findWords(vector<vector<char> >& board,
                             vector<string>& words)
    {
        this->board = board;
        vector<string> ans;

        // Nothing can be found on an empty board.
        if (board.empty() || board[0].empty())
            return ans;

        // search() restores every cell it marks, so one matrix serves all words.
        vector<vector<bool> > vis(
            board.size(),
            vector<bool>(board[0].size(), false));

        for (auto& word : words) {
            if (word.empty())
                continue;

            bool found = false;
            for (size_t i = 0; i < board.size() && !found; i++) {
                for (size_t j = 0; j < board[i].size() && !found; j++) {
                    if (board[i][j] == word[0])
                        found = search(i, j, word, 1, vis);
                }
            }
            if (found)
                ans.push_back(word);
        }
        return ans;
    }

private:
    // Backtracking search working on s[pos..]; vis is shared by reference and
    // every mark is undone before returning, so no copies are needed.
    bool search(int x, int y, const string& s, size_t pos,
                vector<vector<bool> >& vis)
    {
        if (pos == s.length())
            return true;

        vis[x][y] = true;
        bool sol = false;

        for (size_t i = 0; i < mover.size() && !sol; i++) {
            int curr_x = mover[i][0] + x;
            int curr_y = mover[i][1] + y;

            if (curr_x >= 0 && curr_x < (int)board.size()
                && curr_y >= 0 && curr_y < (int)board[0].size()
                && board[curr_x][curr_y] == s[pos]
                && !vis[curr_x][curr_y]) {
                sol = search(curr_x, curr_y, s, pos + 1, vis);
            }
        }

        vis[x][y] = false;
        return sol;
    }
};

// Driver: prints the words of the sample list that occur on the sample board.
int main()
{
    Solution solver;

    vector<vector<char> > board
        = { { 'o', 'a', 'a', 'n' },
            { 'e', 't', 'a', 'e' },
            { 'i', 'h', 'k', 'r' },
            { 'i', 'f', 'l', 'v' } };

    vector<string> words = { "oath", "pea", "eat", "rain" };

    vector<string> ans = solver.findWords(board, words);

    cout << "words present:\n";
    for (auto& part : ans)
        cout << part << endl;

    return 0;
}

Java

import java.util.*;

class Solution {

int[][] mover = {{1, 0}, {0, 1}, {-1, 0}, {0, -1}};

char[][] board;

public boolean dfs(int x, int y, String s, boolean[][] vis) {

if (s.length() == 0) {

return true;

}

vis[x][y] = true;

boolean sol = false;

for (int i = 0; i < mover.length; i++) {

int curr_x = mover[i][0] + x;

int curr_y = mover[i][1] + y;

if (curr_x >= 0 && curr_x < board.length) {

if (curr_y >= 0 && curr_y < board[0].length) {

if (board[curr_x][curr_y] == s.charAt(0) && !vis[curr_x][curr_y]) {

String k = s.substring(1);

sol |= dfs(curr_x, curr_y, k, vis);

}

return sol;

}

public List<String> findWords(char[][] board, String[] words) {

this.board = board;

List<String> ans = new ArrayList<>();

for (String word : words) {

boolean[][] vis = new boolean[board.length][board[0].length];

for (int i = 0; i < board.length; i++) {

for (int j = 0; j < board[i].length; j++) {

if (board[i][j] == word.charAt(0)) {

String s = word.substring(1);

if (dfs(i, j, s, vis)) {

ans.add(word);

break;

}

if (!ans.isEmpty() && ans.get(ans.size() - 1).equals(word)) {

break;

}

return ans;

}

class Main {

public static void main(String[] args) {

Solution solver = new Solution();

char[][] board = { {'o','a','a','n'}, {'e','t','a','e'}, {'i','h','k','r'}, {'i','f','l','v'} };

String[] words = {"oath","pea","eat","rain"};

System.out.println("Words present: ");

List<String> arr = solver.findWords(board, words);

for (int i = 0; i < arr.size(); i++) {

System.out.println(arr.get(i));

}

Python3

class Solution:
    """Finds which words of a list can be traced on a character board.

    A word is traced by moving one cell up, down, left or right at a time
    without using any cell twice.
    """

    # Row/column offsets of the four neighbours: down, right, up, left.
    mover = [[1, 0], [0, 1], [-1, 0], [0, -1]]

    board = []

    def dfs(self, x, y, s, vis):
        """Match the remaining characters s from the neighbours of (x, y).

        Cell (x, y) is assumed to be matched already. Every cell marked in
        vis is unmarked again before returning, so a failed attempt never
        blocks a later one.
        """
        if len(s) == 0:
            return True

        vis[x][y] = True
        sol = False

        for dx, dy in self.mover:
            curr_x = dx + x
            curr_y = dy + y
            if (0 <= curr_x < len(self.board)
                    and 0 <= curr_y < len(self.board[0])
                    and self.board[curr_x][curr_y] == s[0]
                    and not vis[curr_x][curr_y]):
                if self.dfs(curr_x, curr_y, s[1:], vis):
                    sol = True
                    break

        # Backtrack: release the cell for other paths.
        vis[x][y] = False
        return sol

    def findWords(self, board, words):
        """Return the words that occur on the board, in the order given."""
        self.board = board
        ans = []

        # Nothing can be found on an empty board.
        if not board or not board[0]:
            return ans

        for word in words:
            if not word:
                continue

            vis = [[False for _ in range(len(board[0]))] for _ in range(len(board))]

            found = False
            for i in range(len(board)):
                for j in range(len(board[i])):
                    if board[i][j] == word[0] and self.dfs(i, j, word[1:], vis):
                        found = True
                        break
                if found:
                    break

            if found:
                ans.append(word)

        return ans

# Driver: prints the words of the sample list that occur on the sample board.
solver = Solution()

board = [ ['o','a','a','n'], ['e','t','a','e'], ['i','h','k','r'], ['i','f','l','v'] ]

words = ["oath","pea","eat","rain"]

print("Words present: ")

arr = solver.findWords(board, words)

for i in range(len(arr)):
    print(arr[i])

Javascript

/**
 * Finds which words of a list can be traced on a character board by moving
 * one cell up, down, left or right at a time without reusing a cell.
 */
class Solution {

    // Row/column offsets of the four neighbours: down, right, up, left.
    mover = [ [1, 0], [0, 1], [-1, 0], [0, -1] ];

    board = [];

    /**
     * Cell (x, y) is assumed to be matched already; s holds the characters still
     * to be matched from its neighbours onwards. Every cell marked in vis is
     * unmarked again before returning, so a failed attempt never blocks a later one.
     * @returns {boolean} true when all of s can be matched
     */
    dfs(x, y, s, vis) {
        if (s.length === 0) {
            return true;
        }

        vis[x][y] = true;
        let sol = false;

        for (let i = 0; i < this.mover.length && !sol; i++) {
            let curr_x = this.mover[i][0] + x;
            let curr_y = this.mover[i][1] + y;

            if (curr_x >= 0 && curr_x < this.board.length
                && curr_y >= 0 && curr_y < this.board[0].length
                && this.board[curr_x][curr_y] == s[0]
                && vis[curr_x][curr_y] == false) {
                sol = this.dfs(curr_x, curr_y, s.substring(1), vis);
            }
        }

        // Backtrack: release the cell for other paths.
        vis[x][y] = false;
        return sol;
    }

    /**
     * @returns {string[]} the words that occur on the board, in the order given
     */
    findWords(board, words) {
        this.board = board;
        let ans = [];

        // Nothing can be found on an empty board.
        if (board.length === 0 || board[0].length === 0) {
            return ans;
        }

        for (let word of words) {
            if (word.length === 0) {
                continue;
            }

            let vis = new Array(board.length).fill(false).map(() => new Array(board[0].length).fill(false));
            let found = false;

            for (let i = 0; i < board.length && !found; i++) {
                for (let j = 0; j < board[i].length && !found; j++) {
                    if (board[i][j] == word[0]) {
                        found = this.dfs(i, j, word.substring(1), vis);
                    }
                }
            }

            if (found) {
                ans.push(word);
            }
        }
        return ans;
    }
}

// Driver: prints the words of the sample list that occur on the sample board.
let solver = new Solution();

let board = [];
board.push(['o','a','a','n']);
board.push(['e','t','a','e']);
board.push(['i','h','k','r']);
board.push(['i','f','l','v']);

let words = ["oath","pea","eat","rain"];

const foundWords = solver.findWords(board, words);

console.log("Words present:");

for (const word of foundWords) {
    console.log(word);
}

C#

using System;

using System.Collections.Generic;

using System.Linq;

/// <summary>
/// Finds which words of a list can be traced on a character board by moving
/// one cell up, down, left or right at a time without reusing a cell.
/// </summary>
class Solution {

    // Row/column offsets of the four neighbours: down, right, up, left.
    private static readonly int[][] Moves = {
        new[] {1, 0}, new[] {0, 1}, new[] {-1, 0}, new[] {0, -1}
    };

    private char[][] _board;

    /// <summary>
    /// Returns the words that occur on the board, in the order they were given.
    /// </summary>
    /// <param name="board">Rectangular board of characters.</param>
    /// <param name="words">Candidate words.</param>
    public IList<string> FindWords(char[][] board, string[] words) {
        _board = board;
        var result = new List<string>();

        // Nothing can be found on an empty board.
        if (board == null || board.Length == 0 || board[0].Length == 0) {
            return result;
        }

        foreach (var word in words) {
            if (Exists(word)) {
                result.Add(word);
            }
        }
        return result;
    }

    // Tries every cell holding the first letter as a starting point.
    private bool Exists(string word) {
        if (string.IsNullOrEmpty(word)) {
            return false;
        }

        for (var i = 0; i < _board.Length; i++) {
            for (var j = 0; j < _board[0].Length; j++) {
                if (_board[i][j] == word[0] && Search(word, i, j, 0)) {
                    return true;
                }
            }
        }
        return false;
    }

    // Matches word[k..] starting at cell (i, j). The cell is overwritten with '#'
    // while its neighbours are explored and restored before returning.
    private bool Search(string word, int i, int j, int k) {
        if (k == word.Length) {
            return true;
        }

        if (i < 0 || j < 0 || i == _board.Length || j == _board[0].Length) {
            return false;
        }

        if (_board[i][j] != word[k]) {
            return false;
        }

        var temp = _board[i][j];
        _board[i][j] = '#';

        foreach (var move in Moves) {
            if (Search(word, i + move[0], j + move[1], k + 1)) {
                _board[i][j] = temp;
                return true;
            }
        }

        _board[i][j] = temp;
        return false;
    }
}

/// <summary>Driver: prints the words of the sample list that occur on the sample board.</summary>
public class Program {

    public static void Main() {
        var solver = new Solution();

        var board = new char[][] {
            new char[] {'o', 'a', 'a', 'n'},
            new char[] {'e', 't', 'a', 'e'},
            new char[] {'i', 'h', 'k', 'r'},
            new char[] {'i', 'f', 'l', 'v'}
        };

        var words = new string[] {"oath", "pea", "eat", "rain"};

        var ans = solver.FindWords(board, words);

        Console.WriteLine("Words present:");
        foreach (var word in ans) {
            Console.WriteLine(word);
        }
    }
}

Output

words present:
oath
eat

Time Complexity: O(r*c*len(words)*number of words), as we are using recursion to traverse the matrix. Where r and c are the rows and columns of the grid.

Auxiliary Space: O(r*c*number of words), as we are using extra space for the matrix. Where r and c are the rows and columns of the grid.

Источник

Word breakers split longer words (e. g. compound nouns) into meaningful constituents. They are used with search engines that do not support infix search. While there aren’t many compounds in spoken English, the problem is more prominent in languages like German or Swedish. Take for example the notorious “Donaudampfschiffahrtsgesellschaftskapitänsmütze”: A word that is made up from at least seven different words, depending on how you count. The problem is how to split possible compounds into their constituents in an efficient way.

This problem arose from a specific limitation of the Microsoft SQL Server 2008 we were using for searching sales item texts in a project. Its full text search engine doesn’t support infix search, only prefix search. This means that, for example, the word “Wildrahmsauce” (venison cream sauce) would not be matched by the query words “Rahm” or “Sauce”. However, this was what our users wanted.

Although SQL Server 2008 includes a word breaker for the German language it is not dictionary-based but rule-based. As such it fails to recognize the components of “Wildrahmsauce”. Moreover we would like to store item texts in three languages (German, French, Italian) in the same table so we would have to make a distinction on a per-record basis, something a SQL server word breaker can’t do. Unfortunately, they can produce other strange side effects as well such as not matching certain prefixes (!), so we had to disable language features entirely. You can apparently implement a DLL and register it as a word breaker with SQL Server, however, we felt that this approach would be rather time consuming and perhaps not robust enough.

A SQL query using LIKE on our approximately 50’000 records proved to be too slow. Holding the complete article list in an indexed structure in RAM or using a separate search component was also not an option. To be able to use SQL Server’s fulltext engine, we needed a way to enrich our search terms with the missing keywords. This meant that we had to implement a word breaker to recognize the words “Rahm” and “Sauce” within “Wildrahmsauce”. Note that the word “Wild” is not required to be a separately indexed word because SQL Server can do prefix matching already.

The first step is to decide on a useful dictionary of possible constituents. Obviously the size of the dictionary affects the performance of the splitting algorithm. It is therefore good practice to use a dictionary of minimal size. In our case we are using the words in the article texts themselves. This is based on the observation that the words “Rahm” and “Sauce” occur as separate words in other article texts, and the same is true for many other words that also appear as constituents of compounds. There are very few cases of constituents that appear only in compounds; these obviously cannot be detected automatically. We add them manually when our users require it.

It is also important that the minimum length of the dictionary words should not be too small. In this case we decided on a minimum length of 3 characters. A smaller limit would lead to many insignificant additional parts while providing little benefit for actual searches that typically contain three characters or more per search keyword.

The word breaker algorithm itself consists of two steps. First, an index is built from the dictionary words. Second, the words that require splitting are matched against this index to determine their constituents.

The implementation of the word breaker I present here is quite naïve. It doesn’t claim to be particularly efficient in respect to both time and space. However, it is easy to understand, quick to implement and fast enough for our purposes.

All search terms and dictionary words are normalized before using them, i. e. we

replace non-alphanumeric characters by blanks,
replace sequences of whitespace characters with one blank,
replace accented characters and substitute umlauts (e. g. ‘ä’) with their non-umlaut equivalents (e. g. ‘ae’),
trim the remaining string and
capitalize it.

An example normalization function is shown here. The same function should be used to normalize query input as well before performing the actual search.

/// <summary>
/// Normalize string (remove special chars, replace accented characters,
/// substitute/expand umlauts, remove whitespace).
/// </summary>
/// <param name="input">Text to normalize; null is treated as empty.</param>
/// <returns>
/// The upper-cased text with every character outside A-Z, 0-9 and the supported
/// accented letters replaced by a blank, runs of whitespace collapsed to one blank,
/// accents removed, umlauts and ß expanded (AE, OE, UE, SS), and no leading or
/// trailing blanks.
/// </returns>
public static string NormalizeString(string input)
{
    if (string.IsNullOrEmpty(input)) return string.Empty;

    // Replace special chars with blanks. The text is upper-cased first, so the
    // cedilla has to be listed in its upper-case form to survive this step.
    var splCharKiller = new Regex(@"[^A-Z0-9ÁÀÂÄÉÈÊËÍÌÎÏÓÒÔÖÚÙÛÜÇß]");
    var r1 = splCharKiller.Replace(input.ToUpperInvariant().Trim(), @" ");

    // Replace multiple whitespace chars with one
    var whitespaceKiller = new Regex(@"\s{2,}");
    var result = new StringBuilder(whitespaceKiller.Replace(r1, @" "));

    // Replace or expand accented characters and umlauts
    result.Replace('Á', 'A');
    result.Replace('À', 'A');
    result.Replace('Â', 'A');
    result.Replace(@"Ä", @"AE");
    result.Replace('É', 'E');
    result.Replace('È', 'E');
    result.Replace('Ê', 'E');
    result.Replace('Ë', 'E');
    result.Replace('Í', 'I');
    result.Replace('Ì', 'I');
    result.Replace('Î', 'I');
    result.Replace('Ï', 'I');
    result.Replace('Ó', 'O');
    result.Replace('Ò', 'O');
    result.Replace('Ô', 'O');
    result.Replace(@"Ö", @"OE");
    result.Replace('Ú', 'U');
    result.Replace('Ù', 'U');
    result.Replace('Û', 'U');
    result.Replace(@"Ü", @"UE");
    result.Replace('Ç', 'C');
    result.Replace(@"ß", @"SS");

    return result.ToString().Trim();
}

A very simple (and inefficient) solution of the word breaker problem in pseudo code is:

for each compound {
    for each possible_word {
        if (compound.Contains(possible_word)) {
            results.Add(possible_word)
        }
    }
}

It’s easy to see that the worst case runtime is O(n * m) where n is the number of compounds and m is the number of possible words. This is far too inefficient for practical purposes, as m almost equals n in our case (remember, we are taking the words from the article texts as possible constituents). The fairly naïve approach we are taking here is to reduce m; while this doesn’t reduce the worst case time complexity of the algorithm we can gain a great speed improvement in practice.

Thus, a refined solution might look like this:

for each compound {
    for each character except the first one {
        get possible_words starting with this character
        for each possible_word {
            if (compound contains the word at the character’s position) {
                results.Add(possible_word)
            }
        }
    }
}

What we do here is reduce the list of possible words by testing only against those words that start with the character we are currently looking at. Getting the list using a hash table can be done in O(1) assuming the hash has been built beforehand. Building the hash is done in linear time, i.e. O(m).

Note that the introduction of the character loop doesn’t introduce an additional factor into the time complexity formula. It just makes the Contains operation of the simple solution explicit, and we can assume that string length is constant for the purposes of this comparison (string length is limited to 255 characters by definition).

However, the speed gain is not yet big enough. One improvement we can still make is to take only those possible words into account that are equal in length or shorter than the rest of the remaining possible compound. Another improvement is to regard two leading characters instead of one. These two conditions greatly reduce the number of words we have to try against the compound.

The resulting data structure consists of word lists indexed by two properties: first, the length of the words, and second, the first two characters of the words. To improve speed the indexes are implemented as dictionaries. In C#, the type is essentially

Dictionary<int, Dictionary<string, List<string>>>

where the int is the word length, the string is the first two characters and the lists contain the suitable words. The Dictionary<string, List<string>> part is implemented as its own class TwoCharIndex, mainly to improve readability.

Here is the code for the word breaker. You can freely use it in commercial or non-commercial projects and modify it according to your needs.

using System;
using System.Collections.Generic;

namespace StringTools
{

    /// <summary>
    /// Index of words keyed by their first two characters. Each key maps to the list of
    /// inserted words that start with those two characters.
    /// </summary>
    /// <remarks>
    /// Keys and list lookups use the default string equality, which is case-sensitive, so
    /// words must be case normalized by the caller before they are inserted or searched.
    /// </remarks>
    class TwoCharIndex : Dictionary<string, List<string>>
    {
        /// <summary>Number of leading characters that form the index key.</summary>
        private const int KeyLength = 2;

        /// <summary>
        /// Inserts the word into the index. Assumes it has been case normalized.
        /// The word is appended unconditionally, so inserting the same word twice stores it twice.
        /// </summary>
        /// <param name="word">The word to insert; must have at least two characters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="word"/> has fewer than two characters.</exception>
        public void Insert(string word)
        {
            // Validate and compute the key once instead of once per dictionary access.
            var key = GetKey(word);

            List<string> wordList;
            if (!TryGetValue(key, out wordList))
            {
                wordList = new List<string>();
                this[key] = wordList;
            }
            wordList.Add(word);
        }

        /// <summary>
        /// Checks whether the word has been inserted into the index.
        /// </summary>
        /// <param name="word">The word to look up; must have at least two characters.</param>
        /// <returns>true if an equal word is stored under the word's two-character key; otherwise false.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="word"/> has fewer than two characters.</exception>
        public bool Find(string word)
        {
            List<string> wordList;
            return TryGetValue(GetKey(word), out wordList) && wordList.Contains(word);
        }

        /// <summary>
        /// Validates the word and returns its first two characters, which form the index key.
        /// </summary>
        private static string GetKey(string word)
        {
            // Fail with a descriptive argument exception instead of a NullReferenceException.
            if (word == null) throw new ArgumentNullException("word");
            if (word.Length < KeyLength) throw new ArgumentException(@"word length must be at least 2");
            return word.Substring(0, KeyLength);
        }
    }

    /// <summary>
    /// Builds an index based on a given dictionary and provides a Break method to find the
    /// dictionary words contained inside a longer (compound) word.
    /// </summary>
    /// <remarks>
    /// Words are grouped by length first and by their first two characters second. This class
    /// performs no case normalization, so the dictionary words and the words passed to Break
    /// must already use the same casing.
    /// </remarks>
    public class WordBreakerIndex
    {
        // Word length -> index of the words having exactly that length.
        readonly Dictionary<int, TwoCharIndex> wordIndexes = new Dictionary<int, TwoCharIndex>();

        // Default for the constructor's aMinLength parameter.
        private const int MinLength = 3;

        // The per-length index keys on the first two characters, so shorter words cannot be stored.
        private const int MinIndexableLength = 2;

        // Effective minimum word length; never below MinIndexableLength.
        private readonly int minLength;

        /// <summary>
        /// Create and build the word breaker index based on the dictionary of given words.
        /// </summary>
        /// <param name="words">The dictionary of words that are used for breaking. Null entries are skipped.</param>
        /// <param name="aMinLength">
        /// The minimum length of words that are used (default = 3). Values below 2 are treated as 2,
        /// because the index needs two characters per word.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public WordBreakerIndex(IEnumerable<string> words, int aMinLength = MinLength)
        {
            if (words == null) throw new ArgumentNullException("words");

            // Clamp so that one-character or empty words never reach the two-character index
            // (which rejects them) and so that Break never works with non-positive lengths.
            minLength = Math.Max(aMinLength, MinIndexableLength);

            foreach (var word in words)
            {
                // take only words that are present and long enough
                if (word == null || word.Length < minLength) continue;

                // get or create the index for words of this length
                TwoCharIndex wordIndex;
                if (!wordIndexes.TryGetValue(word.Length, out wordIndex))
                {
                    wordIndex = new TwoCharIndex();
                    wordIndexes[word.Length] = wordIndex;
                }

                wordIndex.Insert(word);
            }
        }

        /// <summary>
        /// Finds the indexed words that occur inside the given word at any position other than
        /// its start (infix or suffix words).
        /// </summary>
        /// <param name="word">The (compound) word to examine.</param>
        /// <returns>The distinct indexed words found; empty if there are none.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
        public IEnumerable<string> Break(string word)
        {
            if (word == null) throw new ArgumentNullException("word");

            var result = new HashSet<string>();

            // Start from the second character, because the first character (or word component)
            // would be matched by prefix search anyway. Stop once the remaining part is shorter
            // than the shortest word that can be in the index.
            for (var i = 1; word.Length - i >= minLength; i++)
            {
                var sufLength = word.Length - i;

                // try every candidate length that fits into the remaining part, longest first
                for (var l = sufLength; l >= minLength; l--)
                {
                    // Look up the length index first so that no substring is allocated
                    // for lengths that have no indexed words at all.
                    TwoCharIndex wordIndex;
                    if (!wordIndexes.TryGetValue(l, out wordIndex)) continue;

                    var wordPart = word.Substring(i, l);
                    if (wordIndex.Find(wordPart))
                        result.Add(wordPart);
                }
            }

            return result;
        }

        /// <summary>
        /// Counts the word entries stored in the index.
        /// </summary>
        /// <returns>
        /// The sum of the sizes of all word lists; a word that was supplied several times
        /// is counted once per occurrence.
        /// </returns>
        public int GetWordCount()
        {
            var result = 0;
            foreach (var tci in wordIndexes.Values)
                foreach (var twi in tci.Values)
                    result += twi.Count;
            return result;
        }
    }
}

And here are some statistics for our input data (in three languages):

Total records: 47999
Total characters: 3’767’975
Total words (non-distinct, includes one- and two-letter words): 614081
Characters per record: min 25, avg 78.5, max 118
Words per record: min 2, avg 12.8, max 24
Number of unique words in dictionary index (word length > 2): 95210
Time to build dictionary indexes and perform word breaking for all records: ~ 8 seconds
Total number of word constituents added to all records: 400646 (about 8.3 additional keywords per record)

Times are as measured on my development machine which is a reasonably fast Intel Core 2 Quad with 2.66 GHz and 4 GB RAM running Windows 7 32 bit. Loading the records from the database takes much longer than the actual word breaking.

While I don’t claim that this is the fastest you can ever do, it is definitely fast enough for us.

Источник