蚁群算法在Python和C++最接近字符串问题中的异常行为

2024-10-17 00:30:16 发布

您现在位置:Python中文网/ 问答频道 /正文

这可能是一个很长的问题,我提前道歉

我正在做一个项目,目标是为最接近的字符串问题研究不同的解决方案

Let s_1, ... s_n be strings of length m. Find a string s of length m such that it minimizes max{d(s, s_i) | i = 1, ..., n}, where d is the hamming distance.

已经尝试过的一种解决方案是使用蚁群优化,如here所述

这篇论文本身并没有讨论实现细节,所以我在效率方面已经尽了最大的努力。然而,效率并不是唯一不寻常的行为

我不确定这样做是否是常见的做法,但我将通过pastebin展示我的代码,因为我相信如果我直接把它放在这里,它会淹没线程。如果这是一个问题,我不介意编辑线程直接放在这里 正如我之前试验过的所有算法一样,我最初用python编写了这个算法。代码如下:

def solve_(self, problem: CSProblem) -> CSSolution:
        m, n, alphabet, strings = problem.m, problem.n, problem.alphabet, problem.strings
        A = len(alphabet)
        rho = self.config['RHO']
        colony_size = self.config['COLONY_SIZE']
 
        global_best_ant = None
        global_best_metric = m
 
        ants = np.full((colony_size, m), '')
        world_trails = np.full((m, A), 1 / A)
 
 
 
        for iteration in range(self.config['MAX_ITERS']):
            local_best_ant = None
            local_best_metric = m
            for ant_idx in range(colony_size):
 
                for next_character_index in range(m):
                    ants[ant_idx][next_character_index] = random.choices(alphabet, weights=world_trails[next_character_index], k=1)[0]
 
                ant_metric = utils.problem_metric(ants[ant_idx], strings)
 
                if ant_metric < local_best_metric:
                    local_best_metric = ant_metric
                    local_best_ant = ants[ant_idx]
 
            # First we perform pheromone evaporation
            for i in range(m):
                for j in range(A):
                    world_trails[i][j] = world_trails[i][j] * (1 - rho)
 
            # Now, using the elitist strategy, only the best ant is allowed to update his pheromone trails
            best_ant_ys = (alphabet.index(a) for a in local_best_ant)
            best_ant_xs = range(m)
 
            for x, y in zip(best_ant_xs, best_ant_ys):
                world_trails[x][y] = world_trails[x][y] + (1 - local_best_metric / m)
 
            if local_best_metric < global_best_metric:
                global_best_metric = local_best_metric
                global_best_ant = local_best_ant
 
        return CSSolution(''.join(global_best_ant), global_best_metric)

utils.problem_metric函数如下所示:

def hamming_distance(s1, s2):
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

def problem_metric(string, references):
    return max(hamming_distance(string, r) for r in references)

我已经看到,有很多调整和其他参数,你可以添加到ACO,但我一直保持简单,现在。我使用的配置是250次迭代,蚁群大小为10只蚂蚁,rho=0.1。我测试它的问题就在这里:http://tcs.informatik.uos.de/research/csp_cssp,一个叫做2-10-250-1-0.csp(第一个)。字母表仅由“0”和“1”组成,字符串长度为250,总共有10个字符串

对于我提到的ACO配置,使用python解算器,这个问题平均在5秒内得到解决,平均目标函数值为108.55(模拟20次)。正确的目标函数值为96。具有讽刺意味的是,与我第一次尝试实现此解决方案时的平均值相比,5秒的平均值是不错的。然而,它仍然惊人地慢

在进行各种优化之后,我决定尝试在C++中实现完全相同的解决方案,这样就可以看出运行时间之间是否存在显著差异。下面是C++解决方案:

#include <iostream>
#include <vector>
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <random>
#include <chrono>
#include <map>
 
class CSPProblem{
public:
    int m;
    int n;
    std::vector<char> alphabet;
    std::vector<std::string> strings;
 
    CSPProblem(int m, int n, std::vector<char> alphabet, std::vector<std::string> strings)
        : m(m), n(n), alphabet(alphabet), strings(strings)
    {
        
    }  
 
    static CSPProblem from_csp(std::string filepath){
        std::ifstream file(filepath);
        std::string line;
 
        std::vector<std::string> input_lines;
        
        while (std::getline(file, line)){
            input_lines.push_back(line);
        }
 
        int alphabet_size = std::stoi(input_lines[0]);
        int n = std::stoi(input_lines[1]);
        int m = std::stoi(input_lines[2]);
        std::vector<char> alphabet;
        for (int i = 3; i < 3 + alphabet_size; i++){
            alphabet.push_back(input_lines[i][0]);
        }
        std::vector<std::string> strings;
        for (int i = 3 + alphabet_size; i < input_lines.size(); i++){
            strings.push_back(input_lines[i]);
        }
 
        return CSPProblem(m, n, alphabet, strings);
    
    }
 
    int hamm(const std::string& s1, const std::string& s2) const{
        int h = 0;
        for (int i = 0; i < s1.size(); i++){
            if (s1[i] != s2[i])
                h++;
        }
        return h;
    }
 
    int measure(const std::string& sol) const{
        int mm = 0;
        for (const auto& s: strings){
            int h = hamm(sol, s);
            if (h > mm){
                mm = h;
            }
        }
        return mm;
    }
 
    friend std::ostream& operator<<(std::ostream& out, CSPProblem problem){
        out << "m: " << problem.m << std::endl;
        out << "n: " << problem.n << std::endl;
        out << "alphabet_size: " << problem.alphabet.size() << std::endl;
        out << "alphabet: ";
        for (const auto& a: problem.alphabet){
            out << a << " ";
        }
        out << std::endl;
        out << "strings:" << std::endl;
        for (const auto& s: problem.strings){
            out << "\t" << s << std::endl;
        }
        return out;
    }
};
 
 
std::random_device rd;
std::mt19937 gen(rd());
 
int get_from_distrib(const std::vector<float>& weights){
    
    std::discrete_distribution<> d(std::begin(weights), std::end(weights));
    return d(gen);
}
 
int max_iter = 250;
float rho = 0.1f;
int colony_size = 10;
 
int ant_colony_solver(const CSPProblem& problem){
    srand(time(NULL));
    int m = problem.m;
    int n = problem.n;
    auto alphabet = problem.alphabet;
    auto strings = problem.strings;
    int A = alphabet.size();
 
    float init_pher = 1.0 / A;
 
    std::string global_best_ant;
    int global_best_matric = m;
 
    std::vector<std::vector<float>> world_trails(m, std::vector<float>(A, 0.0f));
    for (int i = 0; i < m; i++){
        for (int j = 0; j < A; j++){
            world_trails[i][j] = init_pher;
        }
    }
 
    std::vector<std::string> ants(colony_size, std::string(m, ' '));
 
    
    for (int iteration = 0; iteration < max_iter; iteration++){
        std::string local_best_ant;
        int local_best_metric = m;
 
        for (int ant_idx = 0; ant_idx < colony_size; ant_idx++){
            for (int next_character_idx = 0; next_character_idx < m; next_character_idx++){
                char next_char = alphabet[get_from_distrib(world_trails[next_character_idx])];
                ants[ant_idx][next_character_idx] = next_char;
            }
 
            int ant_metric = problem.measure(ants[ant_idx]);
 
            if (ant_metric < local_best_metric){
                local_best_metric = ant_metric;
                local_best_ant = ants[ant_idx];
            }
            
        }
 
        
        // Evaporation
        for (int i = 0; i < m; i++){
            for (int j = 0; j < A; j++){
                world_trails[i][j] = world_trails[i][j] + (1.0 - rho);
            }
        }
 
        std::vector<int> best_ant_xs;
        for (int i = 0; i < m; i++){
            best_ant_xs.push_back(i);
        }
        
        std::vector<int> best_ant_ys;
        for (const auto& c: local_best_ant){
            auto loc = std::find(std::begin(alphabet), std::end(alphabet), c);
            int idx = loc- std::begin(alphabet);
            best_ant_ys.push_back(idx);
        }
        for (int i = 0; i < m; i++){
            int x = best_ant_xs[i];
            int y = best_ant_ys[i];
 
            world_trails[x][y] = world_trails[x][y] + (1.0 - static_cast<float>(local_best_metric) / m);
        }
        if (local_best_metric < global_best_matric){
            global_best_matric = local_best_metric;
            global_best_ant = local_best_ant;
        }
    }
 
    return global_best_matric;
 
}
 
int main(){
    auto problem = CSPProblem::from_csp("in.csp");
 
    int TRIES = 20;
    std::vector<int> times;
    std::vector<int> measures;
 
    for (int i = 0; i < TRIES; i++){
        auto start = std::chrono::high_resolution_clock::now();
        int m = ant_colony_solver(problem);
        auto stop = std::chrono::high_resolution_clock::now();
        int duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
        
        times.push_back(duration);
        measures.push_back(m);
    }
 
    float average_time = static_cast<float>(std::accumulate(std::begin(times), std::end(times), 0)) / TRIES;
    float average_measure = static_cast<float>(std::accumulate(std::begin(measures), std::end(measures), 0)) / TRIES;
 
    std::cout << "Average running time: " << average_time << std::endl;
    std::cout << "Average solution: " << average_measure << std::endl;
    
    std::cout << "all solutions: ";
    for (const auto& m: measures) std::cout << m << " ";
    std::cout << std::endl;
 
    
    return 0; 
}

现在的平均运行时间只有530.4毫秒。但是,平均目标函数值为122.75,这明显高于python解决方案的值

如果平均函数值是一样的,时间也是一样的,我会简单地把它写成“C++比python快”(尽管速度上的差异也很可疑)。但是,由于C++产生了更糟的解决方案,它使我相信我在C++中做了一些错误。我怀疑的是我使用权重生成字母表索引的方式。在python中,我使用random.choices完成了以下操作:

ants[ant_idx][next_character_index] = random.choices(alphabet, weights=world_trails[next_character_index], k=1)[0]

关于C++,我在一段时间内没有做过,所以我在阅读CPopPype(这是它自己的技能)上有点生疏,并且^ {CD3}解决方案是我从引用中简单复制的:

std::random_device rd;
std::mt19937 gen(rd());
 
int get_from_distrib(const std::vector<float>& weights){
    
    std::discrete_distribution<> d(std::begin(weights), std::end(weights));
    return d(gen);
}

这里令人怀疑的是,我正在全局声明std::random_devicestd::mt19937对象,并且每次都使用相同的对象。我还没能找到一个答案,来说明这是否就是它们应该被使用的方式。但是,如果我将它们放在函数中:

int get_from_distrib(const std::vector<float>& weights){
    std::random_device rd;
    std::mt19937 gen(rd());    
    std::discrete_distribution<> d(std::begin(weights), std::end(weights));
    return d(gen);
}

平均运行时间明显变差,达到8.84秒。然而,更令人惊讶的是,平均函数值也变得更糟,为130。 再说一次,如果这两件事中只有一件发生了变化(比如说,如果时间延长的话),我就能得出一些结论。这样只会让人更加困惑

那么,有人知道为什么会发生这种情况吗

提前谢谢

主要编辑:我觉得很尴尬,问了这么大的问题,而事实上问题在于一个简单的打字错误。也就是说,在C++版本中的蒸发步骤中,我把A+代替了A*。 现在算法在平均解质量方面表现相同。 但是,我仍然可以使用一些技巧来优化python版本


Tags: forsizestringlocalmetricintbeststd
1条回答
网友
1楼 · 发布于 2024-10-17 00:30:16

除了我在问题编辑中提到的愚蠢错误之外,似乎我终于找到了一种体面地优化python解决方案的方法。首先,将world_trailsants保留为numpy数组而不是列表实际上减慢了速度。此外,我实际上不再保存蚂蚁列表,因为每次迭代我只需要最好的一个。 最后,运行cProfile表明很多时间都花在了random.choices上,因此我决定实现我自己的版本,特别适合这种情况。我通过为每个下一次迭代(在trail_row_wise_sums数组中)预先计算每个字符的总权重和,并使用以下函数来实现这一点:

def fast_pick(arr, weights, ws):
    r = random.random()*ws
    for i in range(len(arr)):
        if r < weights[i]:
            return arr[i]
        r -= weights[i]
    return 0

新版本现在如下所示:

def solve_(self, problem: CSProblem) -> CSSolution:
    m, n, alphabet, strings = problem.m, problem.n, problem.alphabet, problem.strings
    A = len(alphabet)
    rho = self.config['RHO']
    colony_size = self.config['COLONY_SIZE']
    miters = self.config['MAX_ITERS']

    global_best_ant = None
    global_best_metric = m
    init_pher = 1.0 / A
    world_trails = [[init_pher for _ in range(A)] for _ in range(m)]
    trail_row_wise_sums = [1.0 for _ in range(m)]

    for iteration in tqdm(range(miters)):

        local_best_ant = None
        local_best_metric = m
        for _ in range(colony_size):
            ant = ''.join(fast_pick(alphabet, world_trails[next_character_index], trail_row_wise_sums[next_character_index]) for next_character_index in range(m))
            ant_metric = utils.problem_metric(ant, strings)

            if ant_metric <= local_best_metric:
                local_best_metric = ant_metric
                local_best_ant = ant

        # First we perform pheromone evaporation
        for i in range(m):
            for j in range(A):
                world_trails[i][j] = world_trails[i][j] * (1 - rho)

        # Now, using the elitist strategy, only the best ant is allowed to update his pheromone trails
        best_ant_ys = (alphabet.index(a) for a in local_best_ant)
        best_ant_xs = range(m)

        for x, y in zip(best_ant_xs, best_ant_ys):
            world_trails[x][y] = world_trails[x][y] + (1 - 1.0*local_best_metric / m)

        if local_best_metric < global_best_metric:
            global_best_metric = local_best_metric
            global_best_ant = local_best_ant

        trail_row_wise_sums = [sum(world_trails[i]) for i in range(m)]
    return CSSolution(global_best_ant, global_best_metric)

平均运行时间现在下降到800毫秒(与之前的5秒相比)。当然,在C++解决方案中应用同样的^ {CD6}优化也加快了C++版本(大约150毫秒),但我想现在我可以把它写为C++比Python快了。p>

Profiler还显示,计算汉明距离花费了大量时间,但这是意料之中的,除此之外,我认为没有其他方法可以更有效地计算任意字符串之间的汉明距离

相关问题 更多 >