0. deepmd
Use-case path: 海康 > A > A-6工作 > deepkit使用案例 (deepkit use cases)
1. Machine learning
1. Installation: V3 version
1. git clone -b v3 https://gitlab.com/Herbbb/torchdemo.git
2. source env_torchdemo (optional)
3. Install the required libraries; edit the first few lines of the CMakeLists (prefix etc.)
completer: the compiler
4. mkdir build
cd build
cmake ..        (pointing at the directory containing the CMakeLists)
make
5. Build the LAMMPS interface
Copy the LAMMPS package over, then run sh ~.sh build 18 (18 is the number of cores on the head node; check the CPUs value with lscpu)
2. Basic workflow for training a potential
1. Build the training set from the OUTCAR in xsf format; the files can be opened directly in xcrystal
sampleing.py
Edit inv2:
traindatapath = ./train-set*
batchsize = 1        (batchsize × number of atoms should be larger than 200)
device = cpu
printstep = 100
hasattention = false (whether or not to add attention)
Training
export OMP_NUM_THREADS=12   (multithreading)
./main.mp --train inv2      (note: it is main.mp, not lam.mpi)
Testing (edit the test section of inv2):
./main.mp --eval inv2       Make sure energies agree to within a few meV and forces to within a few tens of meV; the output is in eV (a quick sketch for checking these errors follows at the end of this section).
Note on threads vs. cores: the 48 cores are virtual; there are only 24 physical cores, and a 6-thread job occupies 6 cores.
When training a machine-learning potential, test the number of processes; more is not always better, e.g. 4 can be far better than 48.
Note 2: training on the downstairs cluster is far too slow; avoid using it for training.
Production runs: use exactly the same inv2 that was used for training, otherwise the calculation will report an error and fail to run. The training set can be extended during training, as long as the new file names do not duplicate those of the original training set.
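The eval criterion above (energies within a few meV, forces within tens of meV) can be checked with a short script once the predicted and DFT values are in arrays. A minimal sketch, assuming you have already parsed the energies (eV) and forces (eV/Å) yourself; the exact format of the --eval output is not documented here, so no parser is included:
import numpy as np

def eval_errors(e_pred, e_dft, f_pred, f_dft, natom):
    """Print energy/force errors in meV, given eV inputs.
    Assumed shapes: energies (nstruct,), forces (nstruct, natom, 3)."""
    de = (np.asarray(e_pred) - np.asarray(e_dft)) * 1000.0 / natom  # meV/atom
    df = (np.asarray(f_pred) - np.asarray(f_dft)) * 1000.0          # meV/Angstrom
    print(f"energy MAE : {np.abs(de).mean():8.2f} meV/atom")
    print(f"energy RMSE: {np.sqrt((de ** 2).mean()):8.2f} meV/atom")
    print(f"force  MAE : {np.abs(df).mean():8.2f} meV/Angstrom")
    print(f"force  RMSE: {np.sqrt((df ** 2).mean()):8.2f} meV/Angstrom")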
3. Machine-learning sampling scripts
1. Extract by directory
import os
import sys
import time
import scipy.stats as st
import numpy as np
from sklearn.neighbors import KernelDensity
from matplotlib import pyplot
def timer(func):
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f"Time elapsed: {end_time - start_time:.4f} seconds")
return result
return wrapper
def find_files(directory):
"""
    Recursively find every file named OUTCAR under a directory and return their full paths
"""
    outcar_files = []  # paths of all files named OUTCAR
for root, dirs, files in os.walk(directory):
for file in files:
if file == "OUTCAR":
outcar_files.append(os.path.join(root, file))
for dir in dirs:
dir_outcar_files = find_files(os.path.join(root, dir))
outcar_files.extend(dir_outcar_files)
        # keep the subdirectories in dirs so they are processed recursively
dirs[:] = [d for d in dirs if os.path.isdir(os.path.join(root, d))]
outcar_files = list(set(outcar_files))
return outcar_files
@timer
def read_vasp_xdatcar(file_path):
f = open(file=file_path)
f.readline()
f.readline()
lat = []
pos = []
# lattice
for _ in range(3):
lat.append(f.readline())
ele = f.readline()
npele = f.readline()
cnt = 0
tmp = None
while True:
line = f.readline()
if "configuration" in line:
cnt += 1
if tmp is not None:
pos.append(tmp)
tmp = []
continue
if not line:
pos.append(tmp)
break
tmp.append(line)
for conf_i in range(len(pos)):
for atom_i in range(len(pos[conf_i])):
pos[conf_i][atom_i] = pos[conf_i][atom_i].split()
pos[conf_i] = np.array(pos[conf_i]).astype(float)
ele = ele.split()
npele = npele.split()
npele = [int(i) for i in npele]
for i in range(len(lat)):
lat[i] = lat[i].split()
lat = np.array(lat).astype(float)
return cnt, ele, npele, lat, pos
def topn(arr, n):
indices = np.argpartition(arr, -n)[-n:]
values = arr[indices]
sort_indices = np.argsort(values)[::-1]
return values[sort_indices], indices[sort_indices]
def split_alp_num(string):
for i in range(len(string)):
if string[i].isdigit():
return [string[:i]], int(string[i:])
def write2my(file_path, ene_i, lat_i, ele_i, coo_i, foc_i):
lat_i = lat_i.reshape(3, 3)
coo_i = coo_i.reshape(-1, 3)
foc_i = foc_i.reshape(-1, 3)
with open(file_path, 'w') as file:
file.write(f"# total energy = {ene_i} eV\n\n")
file.write("CRYSTAL\n")
file.write("PRIMVEC\n")
for j in lat_i:
for k in j:
file.write(f'{k:20.8f}')
file.write('\n')
file.write("PRIMCOORD\n")
file.write(f"{len(coo_i)} 1\n")
for j in range(len(coo_i)):
file.write(f'{ele_i[j]:2}')
# coo
for k in coo_i[j]:
file.write(f"{k:20.8f}")
# force
for k in foc_i[j]:
file.write(f"{k:20.8f}")
file.write("\n")
pass
def direct2pos(lat, pos, direction=True):
"""
lat = [[Ax, Ay, Az],
[Bx, By, Bz],
[Cx, Cy, Cz]]
Pos = [n, 3]
:return: direct = [n, 3]
if direction=True
"""
if direction:
return pos @ lat
else:
return pos @ np.linalg.inv(lat)
class MyDistrib(st.rv_continuous):
def __init__(self, prdf):
super().__init__()
self.prdf = prdf
def _pdf(self, x, *args):
return self.prdf.evaluate(x)
class RejectSamp:
def __init__(self, ocr_name: str):
self.ocr_name = ocr_name # {XDATCAR path: OUTCAR path}
self.rng_min = 0
self.rng_max = None
self.rng_grid = 10000
self.rng = None
self.rng_intv = None
self.pdf = None
self.sum_intv = 10
self.envelope = None
self.samples = None
pass
def sampling(self, num):
nframe, natom, ele, ene, lat, pos, foc = self._anls_ocr()
print(nframe)
print(natom)
# print(ele)
print(ene.shape)
print(lat.shape)
print(pos.shape)
print(foc.shape)
self.rng_max = nframe
self.rng = np.linspace(self.rng_min, self.rng_max, self.rng_grid)
self.rng_intv = self.rng[1] - self.rng[0]
sd = np.sum((pos[1:] - pos[:-1]) ** 2, (1, 2))
nintv = sd.size // self.sum_intv
sum_sd = sd[:nintv * self.sum_intv].reshape(nintv, self.sum_intv).sum(1)
sum_sd_body = np.copy(sum_sd)
sum_sd_body[:sum_sd.size // 20] = sum_sd.min()
topn_val, topn_idx = topn(sum_sd_body, 100)
x_train = topn_idx * self.sum_intv
distrib = KernelDensity(kernel="gaussian", bandwidth=int(0.003 * nframe)).fit(x_train[..., None])
self.pdf = lambda x: np.exp(distrib.score_samples(np.reshape(x, [-1, 1])))
self.envelope = self.pdf(self.rng).max()
cnt = 0
iters = 0
res = np.array([])
while cnt < num:
u = np.random.uniform(self.rng_min, self.rng_max, num - cnt)
y = np.random.uniform(0, self.envelope, num - cnt)
pd = self.pdf(u)
res = np.append(res, u[pd > y])
res = res.astype(int)
res = np.unique(res)
cnt = len(res)
iters += 1
print(f"{iters:<3}{cnt:6}")
print("Total iters: ", iters)
self.samples = res
return self.samples.size, natom, ele, ene[self.samples], \
lat[self.samples], pos[self.samples], foc[self.samples]
def plot(self):
fig, ax = pyplot.subplots(figsize=(10, 6))
ax.hist(self.samples, bins=int(self.samples.size / 8), density=True, alpha=0.5)
ax.plot(self.rng, self.pdf(self.rng), 'r-', lw=1)
ax.plot(self.samples, -0.01 * self.envelope - 0.03 * self.envelope * np.random.random(self.samples.size), "+k")
ax.set_ylim(-self.envelope * 0.05)
ax.set_title(self.ocr_name + " PDF")
return fig
@timer
def _anls_ocr(self):
pos_foc = []
ene = []
lat = []
ions_per = []
types = []
print(self.ocr_name)
with open(self.ocr_name, 'r') as file:
while True:
line = file.readline()
# eof break
if not line:
break
# ions per type
#if "POSCAR" in line:
# ions_per.append(line)
# continue
if "ions per type" in line:
ions_per.append(line)
if "VRHFIN" in line:
types.append(line)
# lat
if "BASIS-vectors" in line:
file.readline()
file.readline()
file.readline()
file.readline()
for i in range(3):
line = file.readline()
lat.append(line)
continue
# pos & force
if "TOTAL-FORCE (eV/Angst)" in line:
line = file.readline()
while True:
line = file.readline()
if not line:
break
if "------" in line:
break
pos_foc.append(line)
continue
# ene
if "energy without entropy" in line:
ene.append(line)
ene = np.array([i.split('=')[1].split()[0] for i in ene]).astype(np.float64)
# lat = np.array([i.split() for i in lat]).astype(np.float64)[:, :3].reshape(-1, 3, 3)
lat = np.array([[float(i[4:][:12]), float(i[4:][12:25]), float(i[4:][25:38])] for i in lat]).astype(np.float64)[:, :3].reshape(-1, 3, 3)
# ele pretreatment
types = [i.split('=')[1].split(':')[0].rstrip() for i in types]
        ions_per = ions_per[0].split()[-4:]  # note: assumes exactly 4 element types in the OUTCAR
ele = sum([[i] * int(j) for i, j in zip(types, ions_per)], [])
pos_foc = np.array([i.split() for i in pos_foc]).astype(np.float64)
nframe = len(ene)
natom = len(ele)
pos = pos_foc[:, :3][:nframe * natom].reshape(nframe, natom, 3)[:nframe]
foc = pos_foc[:, 3:][:nframe * natom].reshape(nframe, natom, 3)[:nframe]
lat = lat[:nframe]
return nframe, natom, ele, ene, lat, pos, foc
if __name__ == '__main__':
# ocr_files = find_files("/data/ljx/third/lizhao/LYSO/LYSO-src/old")
    # paths to the training trajectories (OUTCAR files)
ocr_files = ["/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/1fs/2",
"/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/1fs/4",
"/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/2fs/2",
"/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/2fs/4"]
#ocr_files = ["/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/triple/1",
# "/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/triple/2"]
    # output path for the training set
out_path = "/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/"
if not os.path.exists(out_path):
os.mkdir(out_path)
out_data_path = os.path.join(out_path, "eval-set")
if not os.path.exists(out_data_path):
os.mkdir(out_data_path)
    # number of frames to sample per OUTCAR; the total over all trajectories should be around 10000
smp_per_ocr = 1500
# mode = 'rand'
mode = 'kde'
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Be careful when changing the code below
print(f"total trajectories: {len(ocr_files)}")
if not os.path.exists(out_data_path):
os.mkdir(out_data_path)
xsf_tot_cnt = 0
if mode == 'kde':
for ocr_i in range(len(ocr_files)):
r = RejectSamp(ocr_files[ocr_i])
n_frame, n_atom, ele, ene, lat, pos, foc = r.sampling(smp_per_ocr)
fig = r.plot()
fig_path = os.path.join(out_path, str(ocr_i))
fig.savefig(fig_path)
for j in range(n_frame):
path_j = os.path.join(out_data_path, f'structure{str(smp_per_ocr * ocr_i + j).zfill(5)}.xsf')
ene_j = ene[j]
lat_j = lat[j]
ele_j = ele
coo_j = pos[j]
foc_j = foc[j]
write2my(path_j, ene_j, lat_j, ele_j, coo_j, foc_j)
xsf_tot_cnt += 1
2. Recursively extract everything under a single directory
Note the settings of root_dir and write_dir: root_dir is best set to the directory you want to extract from, using a relative path, so that the extracted file names stay short;
write_dir should be a newly created directory that stores the extracted structures.
import os
import numpy as np
from ase.io import read
from ase import Atoms
def write2my(file_path, ene_i, lat_i, ele_i, coo_i, foc_i, vir_i=None):
lat_i = lat_i.reshape(3, 3)
coo_i = coo_i.reshape(-1, 3)
foc_i = foc_i.reshape(-1, 3)
with open(file_path, 'w') as file:
file.write(f"# total energy = {ene_i} eV\n\n")
if vir_i is not None:
file.write(f"VIRIAL\n")
for i in vir_i:
file.write(f'{i:20.8f}')
file.write("\n")
file.write("CRYSTAL\n")
file.write("PRIMVEC\n")
for j in lat_i:
for k in j:
file.write(f'{k:20.8f}')
file.write('\n')
file.write("PRIMCOORD\n")
file.write(f"{len(coo_i)} 1\n")
for j in range(len(coo_i)):
file.write(f'{ele_i[j]:2}')
# coo
for k in coo_i[j]:
file.write(f"{k:20.8f}")
# force
for k in foc_i[j]:
file.write(f"{k:20.8f}")
file.write("\n")
pass
def read_from_ase_atoms(atoms: Atoms):
ene = atoms.get_potential_energy()
lat = atoms.get_cell()
pos = atoms.get_positions()
foc = atoms.get_forces()
try:
sts = atoms.get_stress()
xx, yy, zz, yz, xz, xy = - sts * atoms.get_volume()
vir = np.array(
[[xx, xy, xz],
[xy, yy, yz],
[xz, yz, zz]]).reshape(-1)
except:
vir = None
ele = atoms.get_chemical_symbols()
return ene, lat, pos, foc, vir, ele
def find_files(directory, filename):
result = []
for root, dirs, files in os.walk(directory):
for file in files:
if file == filename:
result.append(os.path.join(root, file))
return result
if __name__ == '__main__':
root_dir = "./"
outcar_dir = find_files(root_dir, "OUTCAR")
write_dir = "/work/home/xieyu/workplace/liz/2-Li-Y-Si-O-600K/B-升温/data_xsf"
for i in outcar_dir:
xsf_head = '_'.join(i.split('.')[-1].split('/'))
print(xsf_head)
datas = read(i, index=":")
for n, i in enumerate(datas):
ene, lat, pos, foc, vir, ele = read_from_ase_atoms(i)
write2my(
os.path.join(write_dir, f"{xsf_head}_struct{str(n).zfill(6)}.xsf"),
ene_i=ene,
lat_i=lat,
ele_i=ele,
coo_i=pos,
foc_i=foc,
vir_i=vir)
4. Errors and problems encountered
If a cell contains only a few atoms of some species, a small cutoff radius cannot sample enough data for that species and the cutoff must be enlarged.
Fix: increase rcut and acut.
Ask the junior labmate.
The sampling script errors out when the path contains Chinese characters.
5. Adding stress (pressure) to the training
- In the data extracted from the OUTCAR, include the virial information (the VIRIAL block written by write2my above).
- In inv2 the value of vpref must not be 0; it can be set equal to epref.
6. Element-type order in inv2
pair_coeff should follow the atom order of the initial data.nso or the initial POSCAR, not the order of atomic number (a small helper sketch follows below).
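A small hypothetical helper (not part of the toolkit) that reads the element line of the initial POSCAR and prints the pair_style/pair_coeff lines in that same order, so the order cannot accidentally drift toward atomic-number order:
def pair_coeff_from_poscar(poscar_path="POSCAR", model_path="./model/all"):
    # line 6 of a POSCAR holds the element symbols in the order the atoms are listed
    with open(poscar_path) as f:
        elements = f.readlines()[5].split()
    print(f"pair_style acnn {model_path} al")
    print("pair_coeff * * " + " ".join(elements))

pair_coeff_from_poscar()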
2. Active-learning machine learning
Giving active learning a very large initial training set slows it down, because active learning has to sweep over many structures.
If the initial training set is too small, rarer atom types may not be detected and an error is raised.
1. Notes
1. Old version
1 About GPUs
Use gtop to check GPU status; take care not to blow up the GPU memory.
GPU memory usage depends on the cell size, not on the number of data points.
2 About active learning
For every active-learning task, create a new folder on the cluster for the self-consistent (DFT) calculations, to avoid "hash collisions" when several tasks run at the same time.
Update the paths in the files below.
The job name submitted in cal.slurm must be unique, because finished jobs are collected back by job name.
Remember to activate the python3 environment (otherwise the newly found data are not written; this has since been fixed).
Set which GPU to use.
Several jobs can be submitted to the same GPU.
For a standalone training run, set restart to false.
Do not put a POSCAR into fp, otherwise the same structure is labeled over and over again.
Job log files should be named consistently: 3-md-log (LAMMPS MD output)
2-train-log (training output)
4-select-log (selection output)
vasp.log (VASP output)
The number of training epochs does not need to be large: 3000-5000.
Initial training set size: around 100 structures.
The acnn paths (in sub-train.sh and sub-select.sh) must be correct.
Check rcut etc. in inv2.
The job names in sub-md.sh and sub.slurm must differ between tasks and must also differ from job names already on the cluster.
"waiting remote jobs: 1" means one job is waiting to be collected.
lr is the learning rate; a smaller learning rate means the model changes less per update.
In inv2, select_thr sets the lower threshold for selecting structures. The goal is to pick structures that differ strongly from each other, i.e. to increase the "target" value in sub-select.log (the determinant of a matrix: the more its rows differ, the larger the determinant). Early on, when little has been run and many structures are similar, keep the threshold high so that not everything is added; later, once much more has been run, lower it to increase diversity.
The current recommendation is select_thr = 1.2 (a small determinant illustration follows below).
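For intuition, a tiny numpy illustration of the determinant-as-diversity idea behind the "target" value (purely illustrative; the actual feature vectors used by sub-select are not shown in these notes): near-duplicate rows drive the determinant toward zero, while diverse rows give a large one.
import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=3)
similar = np.vstack([a,
                     a + 1e-3 * rng.normal(size=3),
                     a + 1e-3 * rng.normal(size=3)])  # three nearly identical rows
diverse = rng.normal(size=(3, 3))                     # three unrelated rows
print("near-duplicate rows |det|:", abs(np.linalg.det(similar)))  # tiny
print("diverse rows        |det|:", abs(np.linalg.det(diverse)))  # much larger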
3. Things that need testing
- What size of initial data set works best
- The number of training epochs should change as active-learning iterations accumulate; how many epochs is appropriate
- How long a small-cell run needs to be to count as sufficient
- Which steps of active learning consume the most resources and which take the longest
- nohup ./sub-train.sh > train.log 2>&1 &
- The GPU device name in inv2
- Masses and element names in in.lmp
- Provide a data.nso file
4. Training strategy: enhanced sampling
Start with a small cell and few epochs; once trained to a reasonable level (the small cell can be run for long enough), switch to a large cell and more epochs.
The aim is to enlarge the training set.
Different supercell expansions of the large cell can be supplied in turn.
In inv2, nbatch and savestep must be changed together.
Increasing the cell size should reduce the number of generations needed.
top -u lijx
Train on a heating run, so that in principle the resulting potential can be used at different temperatures.
Enhanced sampling: larger cell, more training epochs, a wide temperature range, long simulation times.
5. Restarting a run
1. The run did not reach the target condition (e.g. did not complete 1 ns…)
Delete the last generation.
In scheduling.sh change the generation number: do not start from 0, but enter the number of the generation you just deleted; the last model of the previous generation is then read automatically and training continues. After the job is submitted, a message appears in the new generation's training output 2-train.log (you need to scroll down a fair way).
For example, if generations 28 29 30 finished, delete folder 30 and set 30 in scheduling.sh.
When submitting, use >> log, otherwise the restart overwrites the previous log: nohup ./scheduling.sh >> log 2>&1 &
You may also adjust the MD cell size, the simulation time, the training epochs and save interval, and the GPU used for training.
2. The run reached the target condition but you want to continue
When a run finishes normally, the log reports an error that no oos was produced; you can confirm this from log.lammps in the last generation's 3-md output.
Do not delete the last generation; just set scheduling.sh to start at the next one, e.g. if 28 29 30 exist, set it to 31.
Copy the previous generation's Ap_inv_* files over: without a select step these files cannot be generated, but the MD run needs them.
Ap_inv_* is used in active learning to select structures. When MD is run in al mode, Ap_inv_* serves as the selection criterion, but the MD itself does not use it; with or without al the MD result is the same, the only difference is whether structures are harvested.
You can also run without al, in which case Ap_inv is not needed.
To increase sampling, adjust the cell size and simulation time in in.lmp, widen the heating temperature range, increase the training epochs and save interval, and so on.
Also adjust the restart learning rate, e.g. restart_lr = 1e-4.
At a learning rate of 1e-7, further training changes very little.
Lower select_thr in inv2 to 1.2.
6 Batch-killing background jobs under the current folder (when you no longer want them running)
# get all process IDs of user lijx
pgrep -u lijx > lijx_processes.txt
# find which processes are running under the target directory
while read -r pid; do
ls -l /proc/"$pid"/cwd 2>/dev/null | grep -q "/public/lijx/work/Na-X-Si-O/2-onthefly/3-gpu" && echo "$pid"
done < lijx_processes.txt > lijx_processes_in_test.txt
# kill all processes located under that directory and its subdirectories
while read -r pid; do
kill -9 "$pid"
done < lijx_processes_in_test.txt
for i in $(cat lijx_processes_in_test.txt);do llpid $i;done
2. Integrated job-submission workflow
Four files: 1-convert_outcar_xsf.py, 2-convert_POSCAR_data.py, 3-makepot.py, make_initial.sh
Only make_initial.sh needs to be configured.
#required files: POSCAR OUTCAR sub.slurm (VASP) sub-md.sh (LAMMPS)
#switch to the python3 environment first: conda activate
#set vasp_system
vasp_folder='"/work/home/liz/workspace/1-system-MD/5-Na-X-Si-O/md/1-Mo"'
md_duty_name='md-Mo'
vasp_label_duty_name='vasp-Mo'
#set GPU
GPU_number=0
#set md
cat > in.lmp << lzh
variable x index 1
variable y index 1
variable z index 1
variable ss equal 0.001
variable tdamp equal "v_ss*100"
variable dmsdx equal c_1[1]
variable dmsdy equal c_1[2]
variable dmsdz equal c_1[3]
variable dmsd equal c_1[4]
variable istep equal step
variable msdx format dmsdx %16.8e
variable msdy format dmsdy %16.8e
variable msdz format dmsdz %16.8e
variable msd format dmsd %16.8e
variable sstep format istep %-10g
units metal
atom_style atomic
boundary p p p
read_data ./data.nso
mass 1 22.990 #important
mass 2 95.94
mass 3 28.084
mass 4 15.999
replicate \$x \$y \$z
pair_style acnn ./model/all al
pair_coeff * * Na Mo Si O #important
thermo 1
thermo_style custom step time temp press pe lx ly lz ke
thermo_modify format 4 %20.15g
compute 1 all msd com yes
dump out all custom 100 A.lammpstrj id element xu yu zu fx fy fz
dump_modify out element 1 2 3 4
dump_modify out format 1 %4d
dump_modify out format 2 %4s
dump_modify out format float %16.8f
velocity all create 800.0 8728
timestep \${ss}
fix 1 all nvt temp 800 800 \${tdamp}
fix 2 all print 1 "\${sstep} \${msdx} \${msdy} \${msdz} \${msd}" screen no file msd.dat
run 1000000
lzh
#not always need changes
vasp_ip='"liz@59.72.114.246"'
vasp_port='"2222"'
command_line='"slurm"'
# set vasp_input
vasp_pot_PBE_Path='"/public/lijx/work/Na-X-Si-O/pot-makepot/potpaw_PBE/"'
cat > INCAR << liz
SYSTEM = Na-X-Si-O
ALGO = Normal
ISYM = 0
LREAL =Auto
PREC = Normal
EDIFF = 1e-5
ENCUT = 400
NELMIN = 4
NELM = 500
IBRION = -1
ISIF = 2
ISMEAR = 0
SIGMA = 0.05
NPAR = 6
LH5 = .TRUE.
liz
cat > KPOINTS << lizh
K-Spacing Value to Generate K-Mesh: 0.06
0
Gamma
1 1 1
0.0 0.0 0.0
lizh
sed -i "s|^base_pseudo_path.*|base_pseudo_path = $vasp_pot_PBE_Path|" 3-makepot.py
mv ../POSCAR ../resource/fp
mv ../sub.slurm ../resource/fp
mv ../sub-md.sh ../resource/md
mv ./INCAR ../resource/fp
mv ./KPOINTS ../resource/fp
mv ./in.lmp ../resource/md
mv ../OUTCAR ../resource/init_dt
cp ./1-convert_outcar_xsf.py ../resource/init_dt
cp ./2-convert_POSCAR_data.py ../resource/fp
cp ./3-makepot.py ../resource/fp
cd ../resource/init_dt/
python 1-convert_outcar_xsf.py
wait
rm OUTCAR
mv eval-set/* ./
cd ../fp
python 2-convert_POSCAR_data.py POSCAR data.nso
wait
mv data.nso ../md
python 3-makepot.py
rm POSCAR
wait
if [ "$command_line"="slurm" ]; then
sed -i "s|^#SBATCH --job-name.*|#SBATCH --job-name=$vasp_label_duty_name|" sub.slurm
elif [ "$command_line" = "PBS" ]; then
sed -i "s|^#PBS -N|#PBS -N $vasp_label_duty_name|" sub.slurm
else
echo "Command line is not 'slurm' or 'PBS. Run another command."
fi
cd ../md
if [ "$command_line"="slurm" ]; then
sed -i "s|^#SBATCH --job-name.*|#SBATCH --job-name=$md_duty_name|" sub-md.sh
elif [ "$command_line" = "PBS" ]; then
sed -i "s|^#PBS -N|#PBS -N $md_duty_name|" sub-md.sh
else
echo "Command line is not 'slurm' or 'PBS. Run another command."
fi
cd ..
sed -i "9s|.*|remote_work_place=$vasp_folder|" iter_scheduling_remote.sh
sed -i "s|^remote_ip=.*|remote_ip=$vasp_ip|" iter_scheduling_remote.sh
sed -i "s|^remote_port=.*|remote_port=$vasp_port|" iter_scheduling_remote.sh
sed -i "s|^task_sys=.*|task_sys=$command_line|" iter_scheduling_remote.sh
sed -i "s|^device.*|device = cuda:$GPU_number|" inv2
1- contents of 1-convert_outcar_xsf.py:
import os
import sys
import time
import scipy.stats as st
import numpy as np
from sklearn.neighbors import KernelDensity
from matplotlib import pyplot
def timer(func):
def wrapper(*args, **kwargs):
start_time = time.time()
result = func(*args, **kwargs)
end_time = time.time()
print(f"Time elapsed: {end_time - start_time:.4f} seconds")
return result
return wrapper
def find_files(directory):
"""
    Recursively find every file named OUTCAR under a directory and return their full paths
"""
    outcar_files = []  # paths of all files named OUTCAR
for root, dirs, files in os.walk(directory):
for file in files:
if file == "OUTCAR":
outcar_files.append(os.path.join(root, file))
for dir in dirs:
dir_outcar_files = find_files(os.path.join(root, dir))
outcar_files.extend(dir_outcar_files)
        # keep the subdirectories in dirs so they are processed recursively
dirs[:] = [d for d in dirs if os.path.isdir(os.path.join(root, d))]
outcar_files = list(set(outcar_files))
return outcar_files
@timer
def read_vasp_xdatcar(file_path):
f = open(file=file_path)
f.readline()
f.readline()
lat = []
pos = []
# lattice
for _ in range(3):
lat.append(f.readline())
ele = f.readline()
npele = f.readline()
cnt = 0
tmp = None
while True:
line = f.readline()
if "configuration" in line:
cnt += 1
if tmp is not None:
pos.append(tmp)
tmp = []
continue
if not line:
pos.append(tmp)
break
tmp.append(line)
for conf_i in range(len(pos)):
for atom_i in range(len(pos[conf_i])):
pos[conf_i][atom_i] = pos[conf_i][atom_i].split()
pos[conf_i] = np.array(pos[conf_i]).astype(float)
ele = ele.split()
npele = npele.split()
npele = [int(i) for i in npele]
for i in range(len(lat)):
lat[i] = lat[i].split()
lat = np.array(lat).astype(float)
return cnt, ele, npele, lat, pos
def topn(arr, n):
indices = np.argpartition(arr, -n)[-n:]
values = arr[indices]
sort_indices = np.argsort(values)[::-1]
return values[sort_indices], indices[sort_indices]
def split_alp_num(string):
for i in range(len(string)):
if string[i].isdigit():
return [string[:i]], int(string[i:])
def write2my(file_path, ene_i, lat_i, ele_i, coo_i, foc_i):
lat_i = lat_i.reshape(3, 3)
coo_i = coo_i.reshape(-1, 3)
foc_i = foc_i.reshape(-1, 3)
with open(file_path, 'w') as file:
file.write(f"# total energy = {ene_i} eV\n\n")
file.write("CRYSTAL\n")
file.write("PRIMVEC\n")
for j in lat_i:
for k in j:
file.write(f'{k:20.8f}')
file.write('\n')
file.write("PRIMCOORD\n")
file.write(f"{len(coo_i)} 1\n")
for j in range(len(coo_i)):
file.write(f'{ele_i[j]:2}')
# coo
for k in coo_i[j]:
file.write(f"{k:20.8f}")
# force
for k in foc_i[j]:
file.write(f"{k:20.8f}")
file.write("\n")
pass
def direct2pos(lat, pos, direction=True):
"""
lat = [[Ax, Ay, Az],
[Bx, By, Bz],
[Cx, Cy, Cz]]
Pos = [n, 3]
:return: direct = [n, 3]
if direction=True
"""
if direction:
return pos @ lat
else:
return pos @ np.linalg.inv(lat)
class MyDistrib(st.rv_continuous):
def __init__(self, prdf):
super().__init__()
self.prdf = prdf
def _pdf(self, x, *args):
return self.prdf.evaluate(x)
class RejectSamp:
def __init__(self, ocr_name: str):
self.ocr_name = ocr_name # {XDATCAR path: OUTCAR path}
self.rng_min = 0
self.rng_max = None
self.rng_grid = 10000
self.rng = None
self.rng_intv = None
self.pdf = None
self.sum_intv = 10
self.envelope = None
self.samples = None
pass
def sampling(self, num):
nframe, natom, ele, ene, lat, pos, foc = self._anls_ocr()
print(nframe)
print(natom)
# print(ele)
print(ene.shape)
print(lat.shape)
print(pos.shape)
print(foc.shape)
self.rng_max = nframe
self.rng = np.linspace(self.rng_min, self.rng_max, self.rng_grid)
self.rng_intv = self.rng[1] - self.rng[0]
sd = np.sum((pos[1:] - pos[:-1]) ** 2, (1, 2))
nintv = sd.size // self.sum_intv
sum_sd = sd[:nintv * self.sum_intv].reshape(nintv, self.sum_intv).sum(1)
sum_sd_body = np.copy(sum_sd)
sum_sd_body[:sum_sd.size // 20] = sum_sd.min()
topn_val, topn_idx = topn(sum_sd_body, 100)
x_train = topn_idx * self.sum_intv
distrib = KernelDensity(kernel="gaussian", bandwidth=int(0.003 * nframe)).fit(x_train[..., None])
self.pdf = lambda x: np.exp(distrib.score_samples(np.reshape(x, [-1, 1])))
self.envelope = self.pdf(self.rng).max()
cnt = 0
iters = 0
res = np.array([])
while cnt < num:
u = np.random.uniform(self.rng_min, self.rng_max, num - cnt)
y = np.random.uniform(0, self.envelope, num - cnt)
pd = self.pdf(u)
res = np.append(res, u[pd > y])
res = res.astype(int)
res = np.unique(res)
cnt = len(res)
iters += 1
print(f"{iters:<3}{cnt:6}")
print("Total iters: ", iters)
self.samples = res
return self.samples.size, natom, ele, ene[self.samples], \
lat[self.samples], pos[self.samples], foc[self.samples]
def plot(self):
fig, ax = pyplot.subplots(figsize=(10, 6))
ax.hist(self.samples, bins=int(self.samples.size / 8), density=True, alpha=0.5)
ax.plot(self.rng, self.pdf(self.rng), 'r-', lw=1)
ax.plot(self.samples, -0.01 * self.envelope - 0.03 * self.envelope * np.random.random(self.samples.size), "+k")
ax.set_ylim(-self.envelope * 0.05)
ax.set_title(self.ocr_name + " PDF")
return fig
@timer
def _anls_ocr(self):
pos_foc = []
ene = []
lat = []
ions_per = []
types = []
print(self.ocr_name)
with open(self.ocr_name, 'r') as file:
while True:
line = file.readline()
# eof break
if not line:
break
# ions per type
#if "POSCAR" in line:
# ions_per.append(line)
# continue
if "ions per type" in line:
ions_per.append(line)
if "VRHFIN" in line:
types.append(line)
# lat
if "BASIS-vectors" in line:
file.readline()
file.readline()
file.readline()
file.readline()
for i in range(3):
line = file.readline()
lat.append(line)
continue
# pos & force
if "TOTAL-FORCE (eV/Angst)" in line:
line = file.readline()
while True:
line = file.readline()
if not line:
break
if "------" in line:
break
pos_foc.append(line)
continue
# ene
if "energy without entropy" in line:
ene.append(line)
ene = np.array([i.split('=')[1].split()[0] for i in ene]).astype(np.float64)
# lat = np.array([i.split() for i in lat]).astype(np.float64)[:, :3].reshape(-1, 3, 3)
lat = np.array([[float(i[4:][:12]), float(i[4:][12:25]), float(i[4:][25:38])] for i in lat]).astype(np.float64)[:, :3].reshape(-1, 3, 3)
# ele pretreatment
types = [i.split('=')[1].split(':')[0].rstrip() for i in types]
        ions_per = ions_per[0].split()[-4:]  # note: assumes exactly 4 element types in the OUTCAR
ele = sum([[i] * int(j) for i, j in zip(types, ions_per)], [])
pos_foc = np.array([i.split() for i in pos_foc]).astype(np.float64)
nframe = len(ene)
natom = len(ele)
pos = pos_foc[:, :3][:nframe * natom].reshape(nframe, natom, 3)[:nframe]
foc = pos_foc[:, 3:][:nframe * natom].reshape(nframe, natom, 3)[:nframe]
lat = lat[:nframe]
return nframe, natom, ele, ene, lat, pos, foc
if __name__ == '__main__':
# ocr_files = find_files("/data/ljx/third/lizhao/LYSO/LYSO-src/old")
    # paths to the training trajectories (OUTCAR files)
ocr_files = ["./OUTCAR"]
#ocr_files = ["/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/triple/1",
# "/work/home/liz/workspace/1-system-MD/3-K5YSi4O12/9-1-lammps/6-train-NNP/outcar/triple/2"]
    # output path for the training set
out_path = "./"
if not os.path.exists(out_path):
os.mkdir(out_path)
out_data_path = os.path.join(out_path, "eval-set")
if not os.path.exists(out_data_path):
os.mkdir(out_data_path)
    # number of frames to sample per OUTCAR; the total over all trajectories should be around 10000
smp_per_ocr = 50
# mode = 'rand'
mode = 'kde'
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Be careful when changing the code below
print(f"total trajectories: {len(ocr_files)}")
if not os.path.exists(out_data_path):
os.mkdir(out_data_path)
xsf_tot_cnt = 0
if mode == 'kde':
for ocr_i in range(len(ocr_files)):
r = RejectSamp(ocr_files[ocr_i])
n_frame, n_atom, ele, ene, lat, pos, foc = r.sampling(smp_per_ocr)
fig = r.plot()
fig_path = os.path.join(out_path, str(ocr_i))
fig.savefig(fig_path)
for j in range(n_frame):
path_j = os.path.join(out_data_path, f'structure{str(smp_per_ocr * ocr_i + j).zfill(5)}.xsf')
ene_j = ene[j]
lat_j = lat[j]
ele_j = ele
coo_j = pos[j]
foc_j = foc[j]
write2my(path_j, ene_j, lat_j, ele_j, coo_j, foc_j)
xsf_tot_cnt += 1
2- contents of 2-convert_POSCAR_data.py:
# convert a VASP POSCAR to a LAMMPS .data file (atomic style)
#
# usage:
# python "this file" POSCAR.input data.output
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ###
import numpy as np
from sys import argv
def anls_poscar(file_name: str):
with open(file_name, 'r') as f:
lines = f.readlines()
bloom_factor = float(lines[1])
lat = np.array([i.split() for i in lines[2: 5]]).astype(float)
lat = bloom_factor * lat
ele_and_num = lines[5][:-1] + ":" + lines[6][:-1]
pos_d = None
if (lines[7][0] == 'd') or (lines[7][0] == 'D'):
# direct coord.
pos_d = np.array([i.split() for i in lines[8:]]).astype(float)
elif (lines[7][0] == 'c') or (lines[7][0] == 'C'):
# cartesian
pos_c = np.array([i.split() for i in lines[8:]]).astype(float)
pos_d = direct2pos(lat, pos_c, False)
return lat, pos_d, ele_and_num
def direct2pos(lat, pos, direction=True):
"""
lat = [[Ax, Ay, Az],
[Bx, By, Bz],
[Cx, Cy, Cz]]
Pos = [n, 3]
:return: if True: return cart = [n, 3]
if False: return direct = [n, 3]
"""
if direction:
return pos @ lat
else:
return pos @ np.linalg.inv(lat)
def clip_ny_norm(x):
return x / np.linalg.norm(x)
def cart2lams(lat):
"""
vasp (POSCAR) Coordinate System:
O = [[ax, ay, az],
[bx, by, bz],
[cx, cy, cz]]
:return:
lammps Coordinate System:
O' = [[xhi - xlo, 0, 0],
[xy, yhi - ylo, 0],
[xz, yz, zhi - zlo]]
"""
ax = (lat[0] ** 2).sum() ** 0.5
a_hat = clip_ny_norm(lat[0])
bx = (lat[1] * a_hat).sum()
by = np.linalg.norm(np.cross(a_hat, lat[1]))
cx = (lat[2] * a_hat).sum()
cy = (lat[2] * np.cross(clip_ny_norm(np.cross(lat[0], lat[1])), a_hat)).sum()
cz = np.abs((lat[2] * clip_ny_norm(np.cross(lat[0], lat[1]))).sum())
new = np.array([
[ax, 0, 0],
[bx, by, 0],
[cx, cy, cz]
])
return new
def write_lmp_atomic_data(filename, lmp_lat, pos_c, strr):
new_coo = pos_c
Z_of_type = strr.split(':')[0].split()
n_type = strr.split(':')[1].split()
new_ele = [[i] * int(n_type[i - 1]) for i in range(1, len(n_type) + 1)]
new_ele = sum(new_ele, [])
# write
with open(filename, "w") as f:
f.writelines("%s (written by ARES-NNP)\n\n" % str(filename + strr))
f.writelines("%i atoms\n" % len(new_coo))
f.writelines("%i atom types\n" % len(Z_of_type))
f.writelines("0.0 %.17f xlo xhi\n" % lmp_lat[0][0])
f.writelines("0.0 %.17f ylo yhi\n" % lmp_lat[1][1])
f.writelines("0.0 %.17f zlo zhi\n" % lmp_lat[2][2])
f.writelines(
" %18.12f %18.12f %18.12f xy xz yz\n\n\n" % (lmp_lat[1][0], lmp_lat[2][0], lmp_lat[2][1]))
f.writelines("Atoms\n\n")
index_list = np.arange(1, len(new_coo) + 1)
for i, ele, coo in zip(index_list, new_ele, new_coo):
f.writelines("{0:>6} {1:>3} {2:20.12f} {3:20.12f} {4:20.12f}\n".format(i, ele, coo[0], coo[1], coo[2]))
if __name__ == '__main__':
filename = argv[2]
lat, pos, strr = anls_poscar(argv[1])
crs_para = np.linalg.norm(lat, axis=1)
exg = np.argsort(crs_para)[::-1]
lat = lat[exg, :]
pos = pos[:, exg]
lmp_lat = cart2lams(lat)
inv = np.linalg.inv(lat) @ lmp_lat
pos_c = direct2pos(lat, pos, True) @ inv
write_lmp_atomic_data(filename, lmp_lat, pos_c, strr)
3- contents of 3-makepot.py:
# dictionary: pseudopotential file path for each element
base_pseudo_path = "/public/lijx/work/Na-X-Si-O/pot-makepot/potpaw_PBE/"
pseudo_paths = {
'H': 'H/POTCAR',
'Li': 'Li_sv/POTCAR',
'Be': 'Be_sv/POTCAR',
'B': 'B/POTCAR',
'C': 'C/POTCAR',
'N': 'N/POTCAR',
'O': 'O/POTCAR',
'F': 'F/POTCAR',
'Ne': 'Ne/POTCAR',
'Na': 'Na_pv/POTCAR',
'Mg': 'Mg_pv/POTCAR',
'Al': 'Al/POTCAR',
'Si': 'Si/POTCAR',
'P': 'P/POTCAR',
'S': 'S/POTCAR',
'Cl': 'Cl/POTCAR',
'K': 'K_sv/POTCAR',
'Ca': 'Ca_sv/POTCAR',
'Sc': 'Sc_sv/POTCAR',
'Ti': 'Ti_pv/POTCAR',
'V': 'V_pv/POTCAR',
'Cr': 'Cr_pv/POTCAR',
'Mn': 'Mn_pv/POTCAR',
'Fe': 'Fe_pv/POTCAR',
'Co': 'Co/POTCAR',
'Ni': 'Ni_pv/POTCAR',
'Cu': 'Cu_pv/POTCAR',
'Zn': 'Zn/POTCAR',
'Ga': 'Ga_d/POTCAR',
'Ge': 'Ge_d/POTCAR',
'As': 'As/POTCAR',
'Se': 'Se/POTCAR',
'Br': 'Br/POTCAR',
'Kr': 'Kr/POTCAR',
'Rb': 'Rb_sv/POTCAR',
'Sr': 'Sr_sv/POTCAR',
'Y': 'Y_sv/POTCAR',
'Zr': 'Zr_sv/POTCAR',
'Nb': 'Nb_pv/POTCAR',
'Mo': 'Mo_pv/POTCAR',
'Tc': 'Tc_pv/POTCAR',
'Ru': 'Ru_pv/POTCAR',
'Rh': 'Rh_pv/POTCAR',
'Pd': 'Pd/POTCAR',
'Ag': 'Ag/POTCAR',
'Cd': 'Cd/POTCAR',
'In': 'In_d/POTCAR',
'Sn': 'Sn_d/POTCAR',
'Sb': 'Sb/POTCAR',
'Te': 'Te/POTCAR',
'I': 'I/POTCAR',
'Xe': 'Xe/POTCAR',
'Cs': 'Cs_sv/POTCAR',
'Ba': 'Ba_sv/POTCAR',
'La': 'La/POTCAR',
'Ce': 'Ce_3/POTCAR',
'Pr': 'Pr_3/POTCAR',
'Nd': 'Nd_3/POTCAR',
'Pm': 'Pm_3/POTCAR',
'Sm': 'Sm_3/POTCAR',
'Eu': 'Eu_3/POTCAR',
'Gd': 'Gd_3/POTCAR',
'Tb': 'Tb_3/POTCAR',
'Dy': 'Dy_3/POTCAR',
'Ho': 'Ho_3/POTCAR',
'Er': 'Er_3/POTCAR',
'Tm': 'Tm_3/POTCAR',
'Yb': 'Yb_2/POTCAR',
'Lu': 'Lu_3/POTCAR',
'Hf': 'Hf_pv/POTCAR',
'Ta': 'Ta_pv/POTCAR',
'W': 'W_pv/POTCAR',
'Re': 'Re_pv/POTCAR',
'Os': 'Os_pv/POTCAR',
'Ir': 'Ir/POTCAR',
'Pt': 'Pt/POTCAR',
'Au': 'Au/POTCAR',
'Hg': 'Hg/POTCAR',
'Tl': 'Tl_d/POTCAR',
'Pb': 'Pb_d/POTCAR',
'Bi': 'Bi/POTCAR',
'Th': 'Th/POTCAR',
'Pa': 'Pa/POTCAR',
'U': 'U/POTCAR',
'Np': 'Np/POTCAR',
'Pu': 'Pu/POTCAR',
    # add other elements and their pseudopotential paths here
}
for element, relative_path in pseudo_paths.items():
full_path = f"{base_pseudo_path}{relative_path}"
pseudo_paths[element] = full_path
# read the POSCAR file
def read_poscar(poscar_path):
with open(poscar_path, 'r') as f:
lines = f.readlines()
elements = lines[5].split()
periodic_table_order = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne','Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar','K','Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co','Ni', 'Cu', 'Zn','Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb','Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te','I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu','Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W','Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po','At','Rn','Fr','Ra','Ac','Th', 'Pa','U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr']
sorted_elements = sorted(elements, key=lambda x: periodic_table_order.index(x))
return sorted_elements
# build the combined POTCAR file
def create_big_potcar(elements, pseudo_paths, output_path):
with open(output_path, 'w') as f_out:
for element in elements:
if element in pseudo_paths:
pseudo_path = pseudo_paths[element]
with open(pseudo_path, 'r') as f_pseudo:
f_out.write(f_pseudo.read())
else:
print(f"赝势文件不存在或未定义:{element}")
def copypot(elements, pseudo_paths):
for element in elements:
output_path2 = "POT-"+element
if element in pseudo_paths:
pseudo_path = pseudo_paths[element]
with open(output_path2, 'w') as f_out:
with open(pseudo_path, 'r') as f_pseudo:
f_out.write(f_pseudo.read())
else:
print(f"赝势文件不存在或未定义:{element}")
if __name__ == "__main__":
poscar_path = "POSCAR" # 输入文件名
output_potcar_path = "POTCAR" # 输出的大 POTCAR 文件名
elements = read_poscar(poscar_path)
create_big_potcar(elements, pseudo_paths, output_potcar_path)
copypot(elements,pseudo_paths)
print("大的 POTCAR 文件已创建")
nohup ./scheduling.sh > log 2>&1 &
3. New version on the Huawei machine
1. Changes
server.sh was added.
iter_scheduling_remote.sh was changed. Neither of these two files needs editing now; both can be copied directly.
2. Tips
1. Meaning of the folders
pre-select: in the final active-learning training step the inv2 uses train_dt; the earlier steps look like they use pre-select, but at run time they actually use train_dt as well.
train_dt
Non-active-learning training: just set restart in the active-learning inv2 to false (do not delete the entry); remember to change the cuda device index in inv2, since only one GPU is requested.
If the trained potential keeps running for only a few MD steps, try changing select_thr = 2.0 to 1.2.
4. Energy and force errors in active learning
During active-learning training the energy and force errors must not get too large, so they need to be monitored.
It can happen that the errors are too large and yet the active-learning workflow finishes normally (e.g. a 1×1×1 cell for 1 ns). In that case the 1 ns result cannot be considered reliable, because the active-learning errors are large.
The current version cannot stop automatically once energy and force reach a target accuracy, so keep a close eye on them.
Ways to improve training accuracy (see the sketch after this list):
- The learning rate and the number of training epochs are the key adjustable parameters; increasing the number of epochs effectively improves accuracy (lowers the energy and force errors).
- Clean the data: a single structure with a very large error can degrade the whole training; removing it lowers the error effectively.
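For the data-cleaning point above, a minimal sketch that scans the .xsf training files written by write2my / 1-convert_outcar_xsf.py and flags structures whose largest force component is unusually big, so they can be inspected and possibly removed; the directory name and the 10 eV/Å threshold are only example values.
import glob
import numpy as np

def max_force_in_xsf(path):
    """Largest |force component| (eV/Angstrom) in one xsf file written by write2my."""
    with open(path) as f:
        lines = f.readlines()
    start = lines.index("PRIMCOORD\n") + 2  # skip the PRIMCOORD header and the "natom 1" line
    forces = [np.abs(np.array(l.split()[4:7], dtype=float)) for l in lines[start:] if l.split()]
    return float(np.max(forces))

threshold = 10.0  # eV/Angstrom, example value only
for xsf in sorted(glob.glob("eval-set/structure*.xsf")):
    fmax = max_force_in_xsf(xsf)
    if fmax > threshold:
        print(f"{xsf}: max |F| = {fmax:.2f} eV/Angstrom -> inspect or remove")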
5. Active learning + minima_hopping
1. How to use minima_hopping
minima_hopping is a sampling algorithm used through ASE; compared with ordinary MD it samples more efficiently, which was thoroughly verified on the Huawei machine in liz/2-Na-X-Si-O/2_NaXSiO_big_model/1_one_element/05_Sb_mini.
Changes needed to use minima_hopping (a minimal ASE sketch follows below):
- In resource/md, replace data.nso with data.vasp; the content of data.vasp is in POSCAR format.
- In resource/md, add minima_hopping.py, which runs the minima_hopping simulation; it can be copied directly, but the element types inside the script must be changed.
- In resource/md, sub-md.sh must be changed from submitting MD to submitting minima_hopping.
- In resource, iter_scheduling_remote.sh changes slightly and can be copied directly.
sub-md.sh, iter_scheduling_remote.sh and minima_hopping.py can be copied directly from the Huawei machine path /liz/2-Na-X-Si-O/2_NaXSiO_big_model/1_one_element/05_Sb_mini/resource into the corresponding locations; minima_hopping.py needs its element types changed.
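A minimal, self-contained sketch of how minima hopping is driven through ASE. This is not the minima_hopping.py from the resource folder: the EMT calculator and the bulk Cu cell below are placeholders standing in for the trained NNP calculator and data.vasp, and the T0/Ediff0 values are only examples.
from ase.build import bulk
from ase.calculators.emt import EMT
from ase.optimize.minimahopping import MinimaHopping

# placeholder system and calculator; the real workflow reads data.vasp and
# attaches the trained NNP/acnn calculator instead
atoms = bulk("Cu", cubic=True) * (2, 2, 2)
atoms.calc = EMT()

hop = MinimaHopping(atoms,
                    T0=800.0,    # initial MD temperature in K (example)
                    Ediff0=0.5)  # initial acceptance energy difference in eV (example)
hop(totalsteps=5)                # performs 5 hops; minima are written to minima.traj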
6. Lessons learned
About the training set
- Use high-accuracy first-principles data to train the potential.
- Structures from a geometry optimization must not be used for training, because their computed energies are not accurate.
- Structures computed with different ENCUT or k-point density must not be used to train the same potential.
- Heating runs
- Adding pressure points
- NPT ensemble
- meta-dynamics
- minim
- AIMD with a larger timestep
- Adding tension/shear data sets
About training a potential that performs well
- High temperatures are harder for the machine-learning potential than low temperatures, so high-temperature data are important.
- More training epochs is better than fewer, even though the reported accuracy may be the same for both.
- A high-accuracy data set is better than a low-accuracy one; the potential trained on it is better.
- An actively learned potential is better than a conventionally trained one.
- If pressure is not being computed, whether or not forces are included makes little difference to the results.
About the errors of the trained potential
1. It is fine if the energy errors do not lie on the parity line, because the machine-learning framework adds a constant term; at the moment the learning rate is still too high, which makes this constant large. Lowering the learning rate fixes it (either manually, or automatically by training long enough). A small illustration follows below.
- One structure has one energy but 3 × (number of atoms) force components, so a large energy deviation for a structure corresponds to large deviations in many force components.
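A small numpy illustration of the constant-term point above (synthetic numbers, purely illustrative): if the predicted energies carry a constant offset, subtracting the fitted offset reveals the much smaller true scatter.
import numpy as np

rng = np.random.default_rng(1)
e_dft = rng.normal(-5.0, 0.05, size=200)                 # fake DFT energies (eV/atom)
e_ml = e_dft + 0.012 + rng.normal(0.0, 0.002, size=200)  # ML energies: offset + small noise

offset = np.mean(e_ml - e_dft)                           # the constant the framework absorbs
rmse_raw = np.sqrt(np.mean((e_ml - e_dft) ** 2))
rmse_no_offset = np.sqrt(np.mean((e_ml - e_dft - offset) ** 2))
print(f"fitted offset        : {offset * 1000:6.1f} meV/atom")
print(f"RMSE (raw)           : {rmse_raw * 1000:6.1f} meV/atom")
print(f"RMSE (offset removed): {rmse_no_offset * 1000:6.1f} meV/atom")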
7. Understanding the code
Ap_* files are generated by sub-select from the existing structures; they are independent of both the training step and the MD step.
Differences between active learning and ordinary machine-learning training:
- Training starts from the previous potential, adds the new data set, resets the learning rate, and so on.
- The MD run adds al in in.lmp and uses the Ap_* generated by select.sh to monitor and stop the MD, producing oos structures.
sub-select.sh must be run on a GPU, otherwise it errors out.
acnn -grade uses Ap_* to compute the grade of a structure; if the grade is above a certain value the structure can be added to oos.
Situations in which active-learning MD is stopped:
- 50 structures have accumulated with a grade greater than 3 and less than 8
- a grade greater than 8 appears
The two select passes in active learning (see the sketch below):
The first pass selects the structures that could not be judged during the MD run; the second pass, after the oos structures are produced, de-duplicates similar oos structures and keeps those with relatively larger grades (spanning a maxvol space and picking its edge).
The first pass uses the existing Ap_inv; the second pass generates a new batch of Ap_inv.
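For intuition only, a generic maxvol-style extrapolation grade: a new structure's feature row is multiplied by the inverse of the matrix built from the already-selected set, and the largest absolute entry is the grade. This is an illustration of the idea, not the actual acnn implementation of Ap_inv or its grade formula.
import numpy as np

rng = np.random.default_rng(2)
A_sel = rng.normal(size=(5, 5))   # feature rows of the currently selected structures
Ap_inv = np.linalg.inv(A_sel)     # analogue of the Ap_inv files

def grade(feature_row):
    # how far outside the span of the selected set the new row lies (<= 1 means well covered)
    return float(np.max(np.abs(feature_row @ Ap_inv)))

inside = 0.4 * A_sel[0] + 0.5 * A_sel[1]  # a structure similar to already-selected ones
outside = 10.0 * rng.normal(size=5)       # a very different structure
print("grade (similar structure)  :", round(grade(inside), 2))
print("grade (different structure):", round(grade(outside), 2))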
Please credit the source when reposting; for questions, open an issue on GitHub.