Use the following code to select the 30 most abundant taxa from the result file
# -*- coding: utf-8 -*-
"""Species abundance analysis (transposed version).ipynb
Automatically generated by Colaboratory.
"""
# ================== Environment setup ==================
!pip install openpyxl
import pandas as pd
import re
from google.colab import files
from IPython.display import clear_output

# ================== File upload ==================
def upload_excel():
    """Safely upload an Excel file."""
    print("Please upload the Excel file containing the species abundances")
    uploaded = files.upload()
    clear_output(wait=True)
    if not uploaded:
        raise ValueError("⚠️ No file detected, please upload again")
    excel_files = [k for k in uploaded.keys() if k.endswith(('.xlsx', '.xls'))]
    if not excel_files:
        raise ValueError("❌ Only .xlsx or .xls files are supported")
    print(f"✅ File uploaded: {excel_files[0]}")
    return excel_files[0]
# ================== Data loading ==================
try:
    file_name = upload_excel()
    # Read the data (species as rows, samples as columns)
    df = pd.read_excel(
        file_name,
        engine='openpyxl',
        index_col=0  # first column holds the species names
    )
    print("\nData loaded! Format before transposition:")
    print(f"Number of species: {df.shape[0]}, number of samples: {df.shape[1]}")
    display(df.head(2))
except Exception as e:
    print(f"Error: {str(e)}")
    raise
# ================== Group assignment ==================
def classify_sample(col):
    """Classify a sample by its column name."""
    match = re.search(r'(CTL|CC1|HPP)\d*$', str(col))
    return match.group(1) if match else None

# Build the group dictionary
groups = {'CTL': [], 'CC1': [], 'HPP': []}
for col in df.columns:
    if (group := classify_sample(col)) and group in groups:
        groups[group].append(col)
# ================== Core computation ==================
results = {}
for group, samples in groups.items():
    if not samples:
        print(f"⚠️ Warning: no samples found for group {group}")
        continue
    # Within-group mean abundance per species
    group_mean = df[samples].mean(axis=1)
    # Take the TOP 30 species and transpose (the group becomes a row)
    top30 = group_mean.nlargest(30).to_frame(name=group).T  # key transposition step
    # Store the result
    results[group] = top30
    print(f"\n{group} group preview:")
    display(top30.head())
# ================== Output ==================
# Merge the per-group results; pd.concat aligns the rows on the union
# of each group's TOP 30 species, filling NaN where a species is
# absent from a group's TOP 30
final_df = pd.concat(results.values())
# Turn the group label (the index) into the first column
final_df.reset_index(inplace=True)
final_df.rename(columns={'index': 'Group'}, inplace=True)
# Write the CSV, then trigger the browser download
# (files.download expects a file path, not raw bytes)
output_name = 'Top30_abundance_by_group.csv'
final_df.to_csv(output_name, index=False, encoding='utf-8-sig')
files.download(output_name)
print("""
✅ Done! Output file:
--------------------------------------------------
File name: Top30_abundance_by_group.csv
Structure:
- First column "Group": CTL/CC1/HPP
- Remaining columns: the union of each group's TOP 30 species
- Values are the group's mean abundance (NaN if a species is not
  in that group's TOP 30)
""")
Data normalization
Microbial abundance: total-sum scaling normalizes each sample's total abundance to 1×10⁶ reads, removing differences in sequencing depth.
Physicochemical indicators
For compositional indicators such as total acidity and amino-acid nitrogen content, apply the centered log-ratio (CLR) transform, where g(x) is the geometric mean:
$$ \mathrm{CLR}(x) = \log(x) - \log(g(x)) $$
For continuous indicators such as pH, apply Z-score standardization:
$$ z = \frac{x - \mu}{\sigma} $$
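A minimal sketch of the two physicochemical transforms on toy numbers (the values are hypothetical, just to show the arithmetic):

import numpy as np
from scipy import stats

acidity = np.array([0.12, 0.08, 0.15, 0.10])          # hypothetical compositional values
clr = np.log(acidity) - np.log(stats.gmean(acidity))  # CLR = log(x) - log(g(x))

ph = np.array([4.2, 4.5, 3.9, 4.1])                   # hypothetical pH readings
z = (ph - ph.mean()) / ph.std(ddof=1)                 # Z-score = (x - μ) / σ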
Spearman joint analysis
# -*- coding: utf-8 -*-
"""
Microbiota-physicochemical joint analysis pipeline
Steps: data normalization → Spearman correlation → network file generation
Requirements: Python 3.8+ | pandas, numpy, scipy, statsmodels
"""
# ================== Libraries ==================
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

# ================== Parameters ==================
INPUT_FILE = "microbial_physico_data.xlsx"  # input file path
SAMPLE_PREFIX = ['CTL', 'CC1', 'HPP']       # sample group prefixes
TOP_N = 30                                  # TOP N species per group
RHO_THRESH = 0.6                            # correlation coefficient threshold
FDR_ALPHA = 0.05                            # significance level
# ================== Data loading ==================
def load_data():
    """Read the Excel file and split microbial from physicochemical columns."""
    raw_data = pd.read_excel(INPUT_FILE, index_col=0)
    # Identify physicochemical columns (assumes their names contain '_phy_')
    physico_cols = [col for col in raw_data.columns if '_phy_' in col]
    # Keep the original column order (set() would scramble it)
    microbial_cols = [col for col in raw_data.columns if col not in physico_cols]
    return raw_data[microbial_cols], raw_data[physico_cols]
# ================== Normalization ==================
def normalize_data(micro_df, physico_df):
    """Apply both normalization schemes."""
    # Total-sum scaling: samples are rows, so divide each row by its
    # row sum (axis=0 aligns the sums with the index), then scale to 1e6
    micro_norm = micro_df.div(micro_df.sum(axis=1), axis=0) * 1e6
    # Physicochemical indicators
    physico_norm = pd.DataFrame(index=physico_df.index)
    for col in physico_df.columns:
        # Continuous indicators -> Z-score
        if physico_df[col].nunique() > 10:
            physico_norm[col] = (physico_df[col] - physico_df[col].mean()) / physico_df[col].std()
        # Compositional data (e.g. proportions) -> CLR with a small pseudocount
        elif physico_df[col].max() <= 1:
            geom_mean = stats.gmean(physico_df[col] + 1e-5)
            physico_norm[col] = np.log((physico_df[col] + 1e-5) / geom_mean)
        # Categorical variables -> one-hot encoding
        else:
            physico_norm = pd.concat([physico_norm,
                                      pd.get_dummies(physico_df[col], prefix=col)], axis=1)
    return micro_norm, physico_norm
# ================== Feature selection ==================
def select_features(micro_norm):
    """Select the TOP N species within each sample group."""
    group_dict = {prefix: [] for prefix in SAMPLE_PREFIX}
    # Assign samples to groups by name prefix
    for sample in micro_norm.index:
        for prefix in SAMPLE_PREFIX:
            if sample.startswith(prefix):
                group_dict[prefix].append(sample)
    # Group mean per species, then take the TOP N
    top_species = []
    for prefix, samples in group_dict.items():
        if samples:
            group_mean = micro_norm.loc[samples].mean(axis=0)
            top_species += group_mean.nlargest(TOP_N).index.tolist()
    return list(set(top_species))  # deduplicate across groups
# ================== Correlation analysis ==================
def calculate_correlation(micro_norm, physico_norm, top_species):
    """Compute Spearman correlations and FDR-corrected p-values."""
    # Merge the two blocks (samples as rows, features as columns)
    combined_df = pd.concat([micro_norm[top_species], physico_norm], axis=1)
    # Full rho and p-value matrices in one call
    rho_matrix, pval_matrix = stats.spearmanr(combined_df, axis=0, nan_policy='omit')
    # Wrap as labelled DataFrames
    features = combined_df.columns.tolist()
    rho_df = pd.DataFrame(rho_matrix, index=features, columns=features)
    pval_df = pd.DataFrame(pval_matrix, index=features, columns=features)
    # FDR-correct the upper triangle only, so each pair is tested once;
    # boolean masking reads the matrix row by row, the same order as the
    # i < j loop in generate_cytoscape_files below
    mask = np.triu(np.ones(pval_df.shape), k=1).astype(bool)
    reject, fdr_pvals = fdrcorrection(pval_df.values[mask], alpha=FDR_ALPHA)
    return rho_df, pval_df, fdr_pvals
# ================== Network file generation ==================
def generate_cytoscape_files(rho_df, pval_df, fdr_pvals, top_species):
    """Write the edge and node files for Cytoscape."""
    # Edge file: keep pairs that pass both the rho and FDR thresholds
    edges = []
    n_features = rho_df.shape[0]
    k = 0  # index into fdr_pvals, same upper-triangle order as above
    for i in range(n_features):
        for j in range(i + 1, n_features):
            if abs(rho_df.iloc[i, j]) > RHO_THRESH and fdr_pvals[k] < FDR_ALPHA:
                edges.append({
                    'Source': rho_df.columns[i],
                    'Target': rho_df.columns[j],
                    'Rho': rho_df.iloc[i, j],
                    'p_value': pval_df.iloc[i, j],
                    'FDR': fdr_pvals[k]
                })
            k += 1  # advance for every pair, kept or not
    edge_df = pd.DataFrame(edges)
    # Node file: 'Microbe' for the selected species, 'Physico' otherwise
    nodes = [{'Id': col, 'Type': 'Microbe' if col in top_species else 'Physico'}
             for col in rho_df.columns]
    node_df = pd.DataFrame(nodes)
    # Save
    edge_df.to_csv("Cytoscape_Edges.csv", index=False)
    node_df.to_csv("Cytoscape_Nodes.csv", index=False)
# ================== Main pipeline ==================
if __name__ == "__main__":
    # Load and normalize
    micro_raw, physico_raw = load_data()
    micro_norm, physico_norm = normalize_data(micro_raw, physico_raw)
    # Feature selection
    top_species = select_features(micro_norm)
    # Correlation analysis
    rho_df, pval_df, fdr_pvals = calculate_correlation(micro_norm, physico_norm, top_species)
    # Network files
    generate_cytoscape_files(rho_df, pval_df, fdr_pvals, top_species)
    print("Done! Generated files: Cytoscape_Edges.csv and Cytoscape_Nodes.csv")
Correlation network visualization
Software: Cytoscape (free download from the official site)
If the archive won't open, or the LAYOUT TOOLS and Legend Panel plugins are missing after installation, you may need to install them yourself (they show up in the sidebar).

The files you need are the two that the Python script generates at the end. I've forgotten some of the exact steps, but I'm attaching the video and posts I followed at the time:
[How to build a herbal medicine-compound-target network in Cytoscape] https://www.bilibili.com/video/BV1zo4y1m7eb/?share_source=copy_web&vd_source=7341bc3bc80eba5824d16b5a9dd6c72a
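From memory (please double-check against the video above), the import in Cytoscape 3.x goes roughly like this:
1. File → Import → Network from File..., choose Cytoscape_Edges.csv, and map the Source and Target columns; Rho, p_value, and FDR come in as edge attributes.
2. File → Import → Table from File..., choose Cytoscape_Nodes.csv, keyed on the Id column, to attach the Microbe/Physico type to each node.
3. In the Style panel, color nodes by Type and edges by the sign of Rho.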