Top K Frequent Words

Given a list of words and an integer k, return the top k frequent words in the list.

You should order the words by the frequency of them in the return list, the most frequent one comes first. If two words has the same frequency, the one with lower alphabetical order come first.

http://www.lintcode.com/en/problem/top-k-frequent-words/

Discussion

算法很简单:1. 统计次数;2.根据次数和string的二元组building heap;3.堆排序,只是输出前K个元素就好了。这个问题是Top K问题的变形,完全是可以深入讨论的问题,具体看follow up的分析。

Solution

用默认的priority_queue加pair实现本来很简单的,但是题目要求If two words has the same frequency, the one with lower alphabetical order come first,这样的话单看frequency用默认的最大堆,而string的比较要求用最小堆了。

默认pair的operator,通不过

vector<string> topKFrequentWords(vector<string>& words, int k) {
        unordered_map<string, int> mymap;
        for(auto const& word : words) {
            mymap[word]++;
        }
        vector<string> result;
        priority_queue<pair<int, string>> que;
        for(auto const& v : mymap) {
            que.push(make_pair(v.second, v.first));
        }
        while(que.empty() == false && k>0) {
            auto v = que.top();
            que.pop();
            k--;
            result.emplace_back(v.second);
        }
        return result;
    }

自定义一个Comparison object。通过

class Solution {
public:
    /**
     * @param words an array of string
     * @param k an integer
     * @return an array of string
     */
     struct cmp{
        bool operator() (pair<int, string> a, pair<int, string> b) {
            if(a.first == b.first) return a.second > b.second;
            return a.first < b.first;
        }
    };
    vector<string> topKFrequentWords(vector<string>& words, int k) {
        unordered_map<string, int> mymap;
        for(auto const& word : words) {
            mymap[word]++;
        }
        vector<string> result;
        priority_queue<pair<int, string>, vector<pair<int, string>>, cmp > que;
        for(auto const& v : mymap) {
            que.push(make_pair(v.second, v.first));
        }
        while(que.empty() == false && k>0) {
            auto v = que.top();
            que.pop();
            k--;
            result.emplace_back(v.second);
        }
        return result;
    }

};

自定义节点类型,重载operator <

class Node {
public:
    int cnt; //frequency
    string word; //word
    Node(int x, string s) {
        cnt = x;
        word = s;
    }
    bool operator< (const Node &rhs) const {//不要忘记const
        if(cnt == rhs.cnt) return word > rhs.word;
        return cnt < rhs.cnt;
    }
};
class Solution {
public:
    /**
     * @param words an array of string
     * @param k an integer
     * @return an array of string
     */
    vector<string> topKFrequentWords(vector<string>& words, int k) {
        unordered_map<string, int> map;
        //statistic of each word
        for(auto const& word : words) {
            mymap[word]++;
        }
        //build up heap
        priority_queue<Node>myque;
        for(auto const& n : map) {
            myque.push(Node(n.second, n.first));
        }
        //find top K
        vector<string> result;
        while(myque.empty() == false && k>0) {
            result.emplace_back(myque.top().word);
            myque.pop();
            k--;
        }
        return result;
    }
};

Follow up

上述算法的复杂度分析:

  • 统计次数 O(n) run time, O(n) space for map
  • 建堆: O(n) run time, O(n) space for heap
  • 输出: O(k*logn)
  • 总的复杂度:O(n) + O(klogn) 取决于k是什么级别的。

Solution 2

没有必要对N个元素建堆,对前K个元素建个最小堆,堆顶元素表示最大的K个元素中的最小值。然后维护这最小堆就可以了。总的复杂度:O(k) (建堆) + O((n-k)logk) (维护) + O(k*logk) (输出)

public:
    int cnt; //frequency
    string word; //word
    Node(int x, string s) {
        cnt = x;
        word = s;
    }
    bool operator< (const Node &rhs) const {
        if(cnt == rhs.cnt) return word > rhs.word;
        return cnt < rhs.cnt;
    }
};

struct cmp {
    bool operator() (Node &A, Node &B) {
        if(A.cnt == B.cnt) return A.word < B.word;
        return A.cnt > B.cnt;
    }
};
class Solution {
public:
    /**
     * @param words an array of string
     * @param k an integer
     * @return an array of string
     */
    vector<string> topKFrequentWords(vector<string>& words, int k) {
        unordered_map<string, int> map;
        //statistic of each word
        for(auto const& word : words) {
            if(map.find(word) == map.end()) {
                map[word] = 1;
            } else {
                map[word] += 1;
            }
        }
        vector<string> result;
        if(map.size() < k) return result;//something wrong
        if(k == 0) return result;

        //build up a k-heap,根据前K个元素建个最小堆,表示前K个最大元素,堆顶就是这个K个里面最小的那个
        //priority_queue<Node>myque;
        priority_queue<Node, vector<Node>, cmp>myque;//自定义数据类型最小堆
        auto it = map.begin();
        for(int n = 0; n <k; n++) {
            myque.push(Node(it->second, it->first));
            advance(it, 1);
        }
        //maintain this heap
        while(it!=map.end()) {
            Node cur = Node(it->second, it->first);
            if(myque.top()<cur) {
                myque.pop();
                myque.push(cur);
            }
            advance(it, 1);
        }
        while(myque.empty() == false) {
            result.emplace_back(myque.top().word);
            myque.pop();
            k--;
        }
        reverse(result.begin(), result.end());
        return result;
    }
};

Follow up again

用unordered_map统计频率不是上策,因为有大量单词的话会造成空间耗费过大。用来查询单词的天然的数据结构是Tire,也是O(1)的时间,严格的说是O(k)的时间,k是单词长度。也可以用来统计频率。

Add and Search Word这个题就是讲Trie的实现的。

另一种减少空间时间用的方法就是把unordered_map里用的string转换成int, 通过编码来实现,但这个对string的长度有限制,因为int最多32位,要encode26个字符至少需要5位(2^5=32>26),如果再算大小写的话就更不够了。关于string转int来编码省空间可以参考Repeated DNA Sequences那道题

class WordDictionary {
public:
    struct TrieNode {
        int cnt =0;
        unordered_map<char, TrieNode*> neighbors;
    };
    WordDictionary() {
        _root = new TrieNode();
        _root->cnt = 0;
    }
    void add(string word) {
        TrieNode* cur = _root;
        for(auto const& c : word) {
            if(cur->neighbors.find(c) == cur->neighbors.end()) {
                cur->neighbors[c] = new TrieNode();
            }
            cur = cur->neighbors[c];
        }
        cur->cnt++;
    }
    int search(string word) {
        return searchHelper(word, 0, _root);
    }
private:
    TrieNode* _root;
    int searchHelper(string &word, int pos, TrieNode *node) {
        if(pos == word.length()) {
            return node->cnt;
        }
        if(node->neighbors.find(word[pos]) == node->neighbors.end()) {
            return 0;
        }
        return searchHelper(word, pos+1, node->neighbors[word[pos]]);
    }
};

class Solution {
public:
    /**
     * @param words an array of string
     * @param k an integer
     * @return an array of string
     */
    struct Node{
        int cnt;
        string word;
        Node(int freq, string w) {
            cnt = freq;
            word = w;
        }
        bool operator< (const Node& B) const {
            if(cnt == B.cnt) return word > B.word;
            return cnt < B.cnt;
        }
    };
    vector<string> topKFrequentWords(vector<string>& words, int k) {
        //build up word dictionary, saving the frequency info
        WordDictionary dict;
        for(auto const& w:words) {
            dict.add(w);
        }
        //build a heap of <string, int> node
        priority_queue<Node> heap;
        unordered_set<string> is_visited;//make sure words in heap not repeated
        for(auto const& w : words) {
            if(is_visited.find(w) != is_visited.end()) {
                continue;
            }
            is_visited.emplace(w);
            int cnt = dict.search(w);
            heap.emplace(Node(cnt, w));
        }
        //output top K
        vector<string> result;
        while(k>0 && heap.empty()==false) {
            result.emplace_back(heap.top().word);
            heap.pop();
            k--;
        }
        return result;
    }

};

Reference

results matching ""

    No results matching ""