1. Use the following code to select the 30 most abundant taxa from the result file (a toy example of the expected input layout follows the code)

    # -*- coding: utf-8 -*-
    """菌种丰度分析(行列转置版).ipynb
    
    Automatically generated by Colaboratory.
    """
    
    # ================== Environment setup ==================
    !pip install openpyxl
    import pandas as pd
    import re
    from google.colab import files
    from IPython.display import clear_output
    
    # ================== File upload ==================
    def upload_excel():
        """Safely upload an Excel file."""
        print("Please upload the Excel file containing the taxa abundances")
        uploaded = files.upload()
        clear_output(wait=True)
        
        if not uploaded:
            raise ValueError("⚠️ No file detected, please upload again")
        
        excel_files = [k for k in uploaded.keys() if k.endswith(('.xlsx', '.xls'))]
        if not excel_files:
            raise ValueError("❌ Only .xlsx or .xls files are supported")
        
        print(f"✅ File uploaded: {excel_files[0]}")
        return excel_files[0]
    
    # ================== Data loading ==================
    try:
        file_name = upload_excel()
        
        # Read the data (taxa as rows, samples as columns)
        df = pd.read_excel(
            file_name,
            engine='openpyxl',
            index_col=0  # first column holds the taxon names
        )
        
        print("\nData loaded successfully! Format before transposing:")
        print(f"Number of taxa: {df.shape[0]}, number of samples: {df.shape[1]}")
        display(df.head(2))
    except Exception as e:
        print(f"Error: {str(e)}")
        raise
    
    # ================== Group assignment ==================
    def classify_sample(col):
        """Classify a sample by the group prefix in its column name."""
        match = re.search(r'(CTL|CC1|HPP)\d*$', str(col))
        return match.group(1) if match else None
    
    # Build the group dictionary
    groups = {'CTL': [], 'CC1': [], 'HPP': []}
    for col in df.columns:
        if (group := classify_sample(col)) and group in groups:
            groups[group].append(col)
    
    # ================== Core computation ==================
    results = {}
    for group, samples in groups.items():
        if not samples:
            print(f"⚠️ Warning: no samples found for group {group}")
            continue
        
        # Mean abundance within the group
        group_mean = df[samples].mean(axis=1)
        
        # Take the TOP 30 taxa and transpose (taxa become columns)
        top30 = group_mean.nlargest(30).to_frame(name=group).T  # the key transpose step
        
        # Store the result
        results[group] = top30
        print(f"\n{group} group preview:")
        display(top30.head())
    
    # ================== Output ==================
    # Merge all group results (columns are the union of each group's TOP 30 taxa;
    # taxa absent from a group's TOP 30 appear as NaN)
    final_df = pd.concat(results.values())
    
    # Add the group label as the first column
    final_df.reset_index(inplace=True)
    final_df.rename(columns={'index': 'Group'}, inplace=True)
    
    # Write the CSV to disk, then trigger the browser download
    # (files.download expects a file path, not raw bytes)
    output_name = 'group_abundance_top30.csv'
    final_df.to_csv(output_name, index=False, encoding='utf-8-sig')
    files.download(output_name)
    
    print("""
    ✅ 处理完成!生成文件说明:
    --------------------------------------------------
    文件名:组别丰度TOP30汇总.csv
    文件结构:
    - 第一列 "样本组别":CTL/CC1/HPP
    - 后续各列:TOP30菌种名称(按丰度从高到低排列)
    - 数值表示对应组别的平均丰度值
    """)
    
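    For reference, a minimal sketch of the input layout the script above expects: taxa as rows, samples as columns whose names end in a group prefix plus a replicate number. All taxon and sample names below are made up for illustration.
    
    import pandas as pd
    
    # Hypothetical table in the layout read by pd.read_excel(..., index_col=0):
    # the index holds taxon names, the columns are samples named <GROUP><replicate>.
    toy = pd.DataFrame(
        {"CTL1": [120, 30], "CTL2": [110, 45],
         "CC11": [80, 60], "HPP1": [95, 20]},
        index=["Lactobacillus", "Bifidobacterium"],
    )
    toy.index.name = "Taxon"
    print(toy)
    # The regex (CTL|CC1|HPP)\d*$ maps CTL1/CTL2 -> CTL, CC11 -> CC1, HPP1 -> HPP.
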
  2. Normalize the data (a standalone sketch follows below; the full logic runs inside normalize_data in step 3)

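    No code accompanied this step in the original notes; the actual transformations run inside the step 3 script (normalize_data). As a minimal standalone sketch, here are the two main transformations used there: total-sum scaling for taxa counts and z-scoring for continuous physico-chemical columns. All column names below are hypothetical.
    
    import pandas as pd
    
    # Hypothetical counts: rows = samples, columns = taxa (the layout used in step 3)
    counts = pd.DataFrame({"taxonA": [100, 400], "taxonB": [300, 100]},
                          index=["CTL1", "HPP1"])
    
    # Total-sum scaling: divide each row by its sample total, scale to counts per million
    tss = counts.div(counts.sum(axis=1), axis=0) * 1e6
    
    # Z-score for a continuous physico-chemical indicator (hypothetical 'pH_phy_1' column)
    physico = pd.DataFrame({"pH_phy_1": [6.8, 7.2, 5.9, 6.5]})
    physico_z = (physico - physico.mean()) / physico.std()
    
    print(tss)
    print(physico_z)
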
  3. Joint Spearman correlation analysis

    # -*- coding: utf-8 -*-
    """
    Microbiome / physico-chemical joint analysis pipeline
    Workflow: data normalization → Spearman correlation → network file generation
    Requirements: Python 3.8+ | pandas, numpy, scipy, statsmodels
    """
    
    # ================== Library imports ==================
    import pandas as pd
    import numpy as np
    from scipy import stats
    from statsmodels.stats.multitest import fdrcorrection
    
    # ================== Parameters ==================
    INPUT_FILE = "microbial_physico_data.xlsx"  # input file path
    SAMPLE_PREFIX = ['CTL', 'CC1', 'HPP']      # sample group prefixes
    TOP_N = 30                                 # number of top taxa per group
    RHO_THRESH = 0.6                           # correlation coefficient threshold
    FDR_ALPHA = 0.05                           # significance level
    
    # ================== Data loading ==================
    def load_data():
        """Read the Excel file and split microbial vs. physico-chemical data.
    
        Note: unlike step 1, samples are rows here and features are columns.
        """
        raw_data = pd.read_excel(INPUT_FILE, index_col=0)
        
        # Identify physico-chemical columns (assumes names contain the '_phy_' tag)
        physico_cols = [col for col in raw_data.columns if '_phy_' in col]
        microbial_cols = list(set(raw_data.columns) - set(physico_cols))
        
        return raw_data[microbial_cols], raw_data[physico_cols]
    
    # ================== Normalization ==================
    def normalize_data(micro_df, physico_df):
        """Normalize the two data blocks separately."""
        # Total-sum scaling per sample (rows are samples, so divide along axis=0),
        # scaled to counts per million
        micro_norm = micro_df.div(micro_df.sum(axis=1), axis=0) * 1e6
        
        # Physico-chemical normalization
        physico_norm = pd.DataFrame()
        for col in physico_df.columns:
            # Continuous indicators: z-score
            if physico_df[col].nunique() > 10:  
                physico_norm[col] = (physico_df[col] - physico_df[col].mean()) / physico_df[col].std()
            # Compositional data (e.g. fractions): centered log-ratio
            elif physico_df[col].max() <= 1:  
                geom_mean = stats.gmean(physico_df[col] + 1e-5)
                physico_norm[col] = np.log((physico_df[col] + 1e-5) / geom_mean)
            # Categorical variables: one-hot encoding
            else:  
                physico_norm = pd.concat([physico_norm, 
                                          pd.get_dummies(physico_df[col], prefix=col)], axis=1)
        
        return micro_norm, physico_norm
    
    # ================== Feature selection ==================
    def select_features(micro_norm):
        """Select the TOP taxa for each sample group."""
        group_dict = {prefix: [] for prefix in SAMPLE_PREFIX}
        
        # Assign samples to groups by name prefix
        for sample in micro_norm.index:
            for prefix in SAMPLE_PREFIX:
                if sample.startswith(prefix):
                    group_dict[prefix].append(sample)
        
        # Compute group means and take the TOP N taxa per group
        top_species = []
        for prefix, samples in group_dict.items():
            if samples:
                group_mean = micro_norm.loc[samples].mean(axis=0)
                top_species += group_mean.nlargest(TOP_N).index.tolist()
        
        return list(set(top_species))  # deduplicate across groups
    
    # ================== Correlation analysis ==================
    def calculate_correlation(micro_norm, physico_norm, top_species):
        """Compute pairwise Spearman correlations."""
        # Combine the two blocks (rows = samples)
        combined_df = pd.concat([micro_norm[top_species], physico_norm], axis=1)
        
        # Full correlation and p-value matrices
        rho_matrix, pval_matrix = stats.spearmanr(combined_df, axis=0, nan_policy='omit')
        
        # Wrap as DataFrames
        features = combined_df.columns.tolist()
        rho_df = pd.DataFrame(rho_matrix, index=features, columns=features)
        pval_df = pd.DataFrame(pval_matrix, index=features, columns=features)
        
        # FDR correction over the upper triangle (row-major order, matched
        # by the traversal in generate_cytoscape_files below)
        mask = np.triu(np.ones(pval_df.shape), k=1).astype(bool)
        reject, fdr_pvals = fdrcorrection(pval_df.values[mask], alpha=FDR_ALPHA)
        
        return rho_df, pval_df, fdr_pvals
    
    # ================== Network file generation ==================
    def generate_cytoscape_files(rho_df, pval_df, fdr_pvals, microbe_cols):
        """Write node and edge tables for Cytoscape."""
        # Edge table: walk the upper triangle in the same row-major order
        # used to build the FDR vector in calculate_correlation
        edges = []
        n_features = rho_df.shape[0]
        k = 0
        for i in range(n_features):
            for j in range(i + 1, n_features):
                if abs(rho_df.iloc[i, j]) > RHO_THRESH and fdr_pvals[k] < FDR_ALPHA:
                    edges.append({
                        'Source': rho_df.columns[i],
                        'Target': rho_df.columns[j],
                        'Rho': rho_df.iloc[i, j],
                        'p_value': pval_df.iloc[i, j],
                        'FDR': fdr_pvals[k]
                    })
                k += 1
        edge_df = pd.DataFrame(edges)
        
        # Node table (microbe_cols is now passed in explicitly; the original
        # referenced an undefined global micro_cols)
        nodes = [{'Id': col, 'Type': 'Microbe' if col in microbe_cols else 'Physico'} 
                 for col in rho_df.columns]
        node_df = pd.DataFrame(nodes)
        
        # Save
        edge_df.to_csv("Cytoscape_Edges.csv", index=False)
        node_df.to_csv("Cytoscape_Nodes.csv", index=False)
    
    # ================== Main workflow ==================
    if __name__ == "__main__":
        # Load and normalize the data
        micro_raw, physico_raw = load_data()
        micro_norm, physico_norm = normalize_data(micro_raw, physico_raw)
        
        # Feature selection
        top_species = select_features(micro_norm)
        
        # Correlation analysis
        rho_df, pval_df, fdr_pvals = calculate_correlation(micro_norm, physico_norm, top_species)
        
        # Generate network files (pass the microbe column list for node typing)
        generate_cytoscape_files(rho_df, pval_df, fdr_pvals, top_species)
        print("Done! Generated files: Cytoscape_Edges.csv and Cytoscape_Nodes.csv")
    
  4. Visualize the correlation network (the edge and node tables are formatted for Cytoscape import; a quick in-Python preview sketch follows)
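
    The original notes end at this heading. The files from step 3 are named for Cytoscape (import Cytoscape_Edges.csv as the network and Cytoscape_Nodes.csv as a node attribute table). For a quick in-Python preview, here is a minimal sketch using networkx and matplotlib; both libraries are an assumption here, not part of the original pipeline.
    
    import pandas as pd
    import networkx as nx
    import matplotlib.pyplot as plt
    
    # Load the tables produced in step 3
    edges = pd.read_csv("Cytoscape_Edges.csv")
    nodes = pd.read_csv("Cytoscape_Nodes.csv")
    
    # Undirected graph with Rho kept as an edge attribute
    G = nx.from_pandas_edgelist(edges, source="Source", target="Target", edge_attr="Rho")
    
    # Color nodes by type (Microbe vs. Physico) using the node table
    type_map = dict(zip(nodes["Id"], nodes["Type"]))
    node_colors = ["tab:green" if type_map.get(n) == "Microbe" else "tab:orange"
                   for n in G.nodes]
    # Red edges for positive correlations, blue for negative
    edge_colors = ["red" if d["Rho"] > 0 else "blue" for _, _, d in G.edges(data=True)]
    
    pos = nx.spring_layout(G, seed=42)
    nx.draw_networkx(G, pos, node_color=node_colors, edge_color=edge_colors,
                     node_size=300, font_size=6)
    plt.axis("off")
    plt.tight_layout()
    plt.savefig("correlation_network.png", dpi=300)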