这可能是一个很长的问题,我提前道歉
我正在做一个项目,目标是为最接近的字符串问题研究不同的解决方案
Let s_1, ... s_n be strings of length m. Find a string s of length m such that it minimizes max{d(s, s_i) | i = 1, ..., n}, where d is the hamming distance.
已经尝试过的一种解决方案是使用蚁群优化,如here所述
这篇论文本身并没有讨论实现细节,所以我在效率方面已经尽了最大的努力。然而,效率并不是唯一不寻常的行为
我不确定这样做是否是常见的做法,但我将通过pastebin展示我的代码,因为我相信如果我直接把它放在这里,它会淹没线程。如果这是一个问题,我不介意编辑线程直接放在这里 正如我之前试验过的所有算法一样,我最初用python编写了这个算法。代码如下:
def solve_(self, problem: CSProblem) -> CSSolution:
m, n, alphabet, strings = problem.m, problem.n, problem.alphabet, problem.strings
A = len(alphabet)
rho = self.config['RHO']
colony_size = self.config['COLONY_SIZE']
global_best_ant = None
global_best_metric = m
ants = np.full((colony_size, m), '')
world_trails = np.full((m, A), 1 / A)
for iteration in range(self.config['MAX_ITERS']):
local_best_ant = None
local_best_metric = m
for ant_idx in range(colony_size):
for next_character_index in range(m):
ants[ant_idx][next_character_index] = random.choices(alphabet, weights=world_trails[next_character_index], k=1)[0]
ant_metric = utils.problem_metric(ants[ant_idx], strings)
if ant_metric < local_best_metric:
local_best_metric = ant_metric
local_best_ant = ants[ant_idx]
# First we perform pheromone evaporation
for i in range(m):
for j in range(A):
world_trails[i][j] = world_trails[i][j] * (1 - rho)
# Now, using the elitist strategy, only the best ant is allowed to update his pheromone trails
best_ant_ys = (alphabet.index(a) for a in local_best_ant)
best_ant_xs = range(m)
for x, y in zip(best_ant_xs, best_ant_ys):
world_trails[x][y] = world_trails[x][y] + (1 - local_best_metric / m)
if local_best_metric < global_best_metric:
global_best_metric = local_best_metric
global_best_ant = local_best_ant
return CSSolution(''.join(global_best_ant), global_best_metric)
utils.problem_metric
函数如下所示:
def hamming_distance(s1, s2):
return sum(c1 != c2 for c1, c2 in zip(s1, s2))
def problem_metric(string, references):
return max(hamming_distance(string, r) for r in references)
我已经看到,有很多调整和其他参数,你可以添加到ACO,但我一直保持简单,现在。我使用的配置是250次迭代,蚁群大小为10只蚂蚁,rho=0.1。我测试它的问题就在这里:http://tcs.informatik.uos.de/research/csp_cssp,一个叫做2-10-250-1-0.csp(第一个)。字母表仅由“0”和“1”组成,字符串长度为250,总共有10个字符串
对于我提到的ACO配置,使用python解算器,这个问题平均在5秒内得到解决,平均目标函数值为108.55(模拟20次)。正确的目标函数值为96。具有讽刺意味的是,与我第一次尝试实现此解决方案时的平均值相比,5秒的平均值是不错的。然而,它仍然惊人地慢
在进行各种优化之后,我决定尝试在C++中实现完全相同的解决方案,这样就可以看出运行时间之间是否存在显著差异。下面是C++解决方案:
#include <iostream>
#include <vector>
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <random>
#include <chrono>
#include <map>
class CSPProblem{
public:
int m;
int n;
std::vector<char> alphabet;
std::vector<std::string> strings;
CSPProblem(int m, int n, std::vector<char> alphabet, std::vector<std::string> strings)
: m(m), n(n), alphabet(alphabet), strings(strings)
{
}
static CSPProblem from_csp(std::string filepath){
std::ifstream file(filepath);
std::string line;
std::vector<std::string> input_lines;
while (std::getline(file, line)){
input_lines.push_back(line);
}
int alphabet_size = std::stoi(input_lines[0]);
int n = std::stoi(input_lines[1]);
int m = std::stoi(input_lines[2]);
std::vector<char> alphabet;
for (int i = 3; i < 3 + alphabet_size; i++){
alphabet.push_back(input_lines[i][0]);
}
std::vector<std::string> strings;
for (int i = 3 + alphabet_size; i < input_lines.size(); i++){
strings.push_back(input_lines[i]);
}
return CSPProblem(m, n, alphabet, strings);
}
int hamm(const std::string& s1, const std::string& s2) const{
int h = 0;
for (int i = 0; i < s1.size(); i++){
if (s1[i] != s2[i])
h++;
}
return h;
}
int measure(const std::string& sol) const{
int mm = 0;
for (const auto& s: strings){
int h = hamm(sol, s);
if (h > mm){
mm = h;
}
}
return mm;
}
friend std::ostream& operator<<(std::ostream& out, CSPProblem problem){
out << "m: " << problem.m << std::endl;
out << "n: " << problem.n << std::endl;
out << "alphabet_size: " << problem.alphabet.size() << std::endl;
out << "alphabet: ";
for (const auto& a: problem.alphabet){
out << a << " ";
}
out << std::endl;
out << "strings:" << std::endl;
for (const auto& s: problem.strings){
out << "\t" << s << std::endl;
}
return out;
}
};
std::random_device rd;
std::mt19937 gen(rd());
int get_from_distrib(const std::vector<float>& weights){
std::discrete_distribution<> d(std::begin(weights), std::end(weights));
return d(gen);
}
int max_iter = 250;
float rho = 0.1f;
int colony_size = 10;
int ant_colony_solver(const CSPProblem& problem){
srand(time(NULL));
int m = problem.m;
int n = problem.n;
auto alphabet = problem.alphabet;
auto strings = problem.strings;
int A = alphabet.size();
float init_pher = 1.0 / A;
std::string global_best_ant;
int global_best_matric = m;
std::vector<std::vector<float>> world_trails(m, std::vector<float>(A, 0.0f));
for (int i = 0; i < m; i++){
for (int j = 0; j < A; j++){
world_trails[i][j] = init_pher;
}
}
std::vector<std::string> ants(colony_size, std::string(m, ' '));
for (int iteration = 0; iteration < max_iter; iteration++){
std::string local_best_ant;
int local_best_metric = m;
for (int ant_idx = 0; ant_idx < colony_size; ant_idx++){
for (int next_character_idx = 0; next_character_idx < m; next_character_idx++){
char next_char = alphabet[get_from_distrib(world_trails[next_character_idx])];
ants[ant_idx][next_character_idx] = next_char;
}
int ant_metric = problem.measure(ants[ant_idx]);
if (ant_metric < local_best_metric){
local_best_metric = ant_metric;
local_best_ant = ants[ant_idx];
}
}
// Evaporation
for (int i = 0; i < m; i++){
for (int j = 0; j < A; j++){
world_trails[i][j] = world_trails[i][j] + (1.0 - rho);
}
}
std::vector<int> best_ant_xs;
for (int i = 0; i < m; i++){
best_ant_xs.push_back(i);
}
std::vector<int> best_ant_ys;
for (const auto& c: local_best_ant){
auto loc = std::find(std::begin(alphabet), std::end(alphabet), c);
int idx = loc- std::begin(alphabet);
best_ant_ys.push_back(idx);
}
for (int i = 0; i < m; i++){
int x = best_ant_xs[i];
int y = best_ant_ys[i];
world_trails[x][y] = world_trails[x][y] + (1.0 - static_cast<float>(local_best_metric) / m);
}
if (local_best_metric < global_best_matric){
global_best_matric = local_best_metric;
global_best_ant = local_best_ant;
}
}
return global_best_matric;
}
int main(){
auto problem = CSPProblem::from_csp("in.csp");
int TRIES = 20;
std::vector<int> times;
std::vector<int> measures;
for (int i = 0; i < TRIES; i++){
auto start = std::chrono::high_resolution_clock::now();
int m = ant_colony_solver(problem);
auto stop = std::chrono::high_resolution_clock::now();
int duration = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
times.push_back(duration);
measures.push_back(m);
}
float average_time = static_cast<float>(std::accumulate(std::begin(times), std::end(times), 0)) / TRIES;
float average_measure = static_cast<float>(std::accumulate(std::begin(measures), std::end(measures), 0)) / TRIES;
std::cout << "Average running time: " << average_time << std::endl;
std::cout << "Average solution: " << average_measure << std::endl;
std::cout << "all solutions: ";
for (const auto& m: measures) std::cout << m << " ";
std::cout << std::endl;
return 0;
}
现在的平均运行时间只有530.4毫秒。但是,平均目标函数值为122.75,这明显高于python解决方案的值
如果平均函数值是一样的,时间也是一样的,我会简单地把它写成“C++比python快”(尽管速度上的差异也很可疑)。但是,由于C++产生了更糟的解决方案,它使我相信我在C++中做了一些错误。我怀疑的是我使用权重生成字母表索引的方式。在python中,我使用random.choices
完成了以下操作:
ants[ant_idx][next_character_index] = random.choices(alphabet, weights=world_trails[next_character_index], k=1)[0]
关于C++,我在一段时间内没有做过,所以我在阅读CPopPype(这是它自己的技能)上有点生疏,并且^ {CD3}解决方案是我从引用中简单复制的:
std::random_device rd;
std::mt19937 gen(rd());
int get_from_distrib(const std::vector<float>& weights){
std::discrete_distribution<> d(std::begin(weights), std::end(weights));
return d(gen);
}
这里令人怀疑的是,我正在全局声明std::random_device
和std::mt19937
对象,并且每次都使用相同的对象。我还没能找到一个答案,来说明这是否就是它们应该被使用的方式。但是,如果我将它们放在函数中:
int get_from_distrib(const std::vector<float>& weights){
std::random_device rd;
std::mt19937 gen(rd());
std::discrete_distribution<> d(std::begin(weights), std::end(weights));
return d(gen);
}
平均运行时间明显变差,达到8.84秒。然而,更令人惊讶的是,平均函数值也变得更糟,为130。 再说一次,如果这两件事中只有一件发生了变化(比如说,如果时间延长的话),我就能得出一些结论。这样只会让人更加困惑
那么,有人知道为什么会发生这种情况吗
提前谢谢
主要编辑:我觉得很尴尬,问了这么大的问题,而事实上问题在于一个简单的打字错误。也就是说,在C++版本中的蒸发步骤中,我把A+代替了A*。 现在算法在平均解质量方面表现相同。 但是,我仍然可以使用一些技巧来优化python版本
除了我在问题编辑中提到的愚蠢错误之外,似乎我终于找到了一种体面地优化python解决方案的方法。首先,将
world_trails
和ants
保留为numpy数组而不是列表实际上减慢了速度。此外,我实际上不再保存蚂蚁列表,因为每次迭代我只需要最好的一个。 最后,运行cProfile
表明很多时间都花在了random.choices
上,因此我决定实现我自己的版本,特别适合这种情况。我通过为每个下一次迭代(在trail_row_wise_sums
数组中)预先计算每个字符的总权重和,并使用以下函数来实现这一点:新版本现在如下所示:
平均运行时间现在下降到800毫秒(与之前的5秒相比)。当然,在C++解决方案中应用同样的^ {CD6}优化也加快了C++版本(大约150毫秒),但我想现在我可以把它写为C++比Python快了。p>
Profiler还显示,计算汉明距离花费了大量时间,但这是意料之中的,除此之外,我认为没有其他方法可以更有效地计算任意字符串之间的汉明距离
相关问题 更多 >
编程相关推荐