Node regression with subgraph and graph sample projections
This Jupyter notebook is hosted here in the Neo4j Graph Data Science Client Github repository.
For a video presentation of an earlier version of this notebook, see the talk Fundamentals of Neo4j Graph Data Science Series 2.x – Pipelines and More, which was given at the NODES 2022 conference.
This notebook exemplifies how to use the node regression pipeline. It also contains many examples of using:
- Convenience objects
- Filtered graphs
- Graph sample projections
It is written in pure Python, to showcase the GDS Python client's ability to abstract away from Cypher queries.
1. The dataset
Our input graph represents Wikipedia pages on particular topics, and how they link to each other:
- Chameleons
- Squirrels
- Crocodiles
The features are the presence of certain informative nouns in the text of the page. The target is the average monthly traffic of the page.
The dataset was first published in Multi-scale Attributed Node Embedding by B. Rozemberczki, C. Allen and R. Sarkar, arXiv preprint 1909.13021. The version hosted here was taken from SNAP on 2022-11-14.
2. Prerequisites
To run this pipeline, you must have a running Neo4j DBMS with a recent version of the Neo4j Graph Data Science plugin installed. These requirements are satisfied if you have an active, running AuraDS instance.
# First, we must install the GDS Python Client
%pip install graphdatascience
import os
# Then, we connect to our Neo4j DBMS hosting the Graph Data Science library
from graphdatascience import GraphDataScience
# Get Neo4j DB URI, credentials and name from environment if applicable
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_AUTH = None
NEO4J_DB = os.environ.get("NEO4J_DB", "neo4j")
if os.environ.get("NEO4J_USER") and os.environ.get("NEO4J_PASSWORD"):
    NEO4J_AUTH = (
        os.environ.get("NEO4J_USER"),
        os.environ.get("NEO4J_PASSWORD"),
    )
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=NEO4J_DB)
# Test our connection and print the Graph Data Science library version
print(gds.server_version())
from graphdatascience import ServerVersion
assert gds.server_version() >= ServerVersion(2, 5, 0)
# Importing the dataset
# The dataset is sourced from this GitHub repository
baseUrl = (
"https://raw.githubusercontent.com/neo4j/graph-data-science-client/main/examples/datasets/wikipedia-animals-pages"
)
# Constraints to speed up importing
gds.run_cypher(
"""
CREATE CONSTRAINT chameleons
FOR (c:Chameleon)
REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
"""
CREATE CONSTRAINT crocodiles
FOR (c:Crocodile)
REQUIRE c.id IS NODE KEY
"""
)
gds.run_cypher(
"""
CREATE CONSTRAINT squirrels
FOR (s:Squirrel)
REQUIRE s.id IS NODE KEY
"""
)
# Create nodes and relationships
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_edges.csv' AS row
MERGE (c1:Chameleon {id: row.id1})
MERGE (c2:Chameleon {id: row.id2})
MERGE (c1)-[:LINK]->(c2)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_edges.csv' AS row
MERGE (c1:Crocodile {id: row.id1})
MERGE (c2:Crocodile {id: row.id2})
MERGE (c1)-[:LINK]->(c2)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_edges.csv' AS row
MERGE (s1:Squirrel {id: row.id1})
MERGE (s2:Squirrel {id: row.id2})
MERGE (s1)-[:LINK]->(s2)
""",
{"baseUrl": baseUrl},
)
# Create target properties
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_target.csv' AS row
MATCH (c:Chameleon {id: row.id})
SET c.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_target.csv' AS row
MATCH (c:Crocodile {id: row.id})
SET c.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_target.csv' AS row
MATCH (s:Squirrel {id: row.id})
SET s.target = toInteger(row.target)
""",
{"baseUrl": baseUrl},
)
# Create feature vectors
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/chameleon/musae_chameleon_features.csv' AS row
MATCH (c:Chameleon {id: row.id})
WITH c, split(row.features, '|') AS features
SET c.features = features
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/crocodile/musae_crocodile_features.csv' AS row
MATCH (c:Crocodile {id: row.id})
WITH c, split(row.features, '|') AS features
SET c.features = features
""",
{"baseUrl": baseUrl},
)
gds.run_cypher(
"""
LOAD CSV WITH HEADERS FROM $baseUrl + '/squirrel/musae_squirrel_features.csv' AS row
MATCH (s:Squirrel {id: row.id})
WITH s, split(row.features, '|') AS features
SET s.features = features
""",
{"baseUrl": baseUrl},
)
3. Preparing the dataset for the pipeline
In order to use the dataset, we must prepare the features in a format that the model supports and that works well. In their raw form, the features are ids of particular words, which is not suitable as input to linear regression.
To overcome this, we will use one-hot encoding, which produces features that do work well for linear regression. We start by learning the dictionary of nouns over each node set. We create a node to host the dictionary, and then use it to one-hot encode all the feature vectors.
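To illustrate the idea, here is a minimal plain-Python sketch of one-hot encoding (for exposition only; the actual encoding below is computed server-side by the gds.alpha.ml.oneHotEncoding function):
# A hypothetical dictionary of nouns, and the nouns present on one page
dictionary = ["claw", "egg", "tail", "tree"]
features = ["tree", "claw"]
# Each page becomes a fixed-length binary vector over the dictionary
one_hot = [1 if noun in features else 0 for noun in dictionary]
print(one_hot)  # [1, 0, 0, 1]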
# Construct one-hot dictionaries
gds.run_cypher(
"""
MATCH (s:Chameleon)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'chameleon', totality: orderedTotality})
RETURN orderedTotality
"""
)
gds.run_cypher(
"""
MATCH (s:Crocodile)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'crocodile', totality: orderedTotality})
RETURN orderedTotality
"""
)
gds.run_cypher(
"""
MATCH (s:Squirrel)
WITH s.features AS features
UNWIND features AS feature
WITH feature
ORDER BY feature ASC
WITH collect(distinct feature) AS orderedTotality
CREATE (:Feature {animal: 'squirrel', totality: orderedTotality})
RETURN orderedTotality
"""
)
# Do one-hot encoding
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'chameleon'})
MATCH (c:Chameleon)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'crocodile'})
MATCH (c:Crocodile)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
gds.run_cypher(
"""
MATCH (f:Feature {animal: 'squirrel'})
MATCH (c:Squirrel)
SET c.features_one_hot = gds.alpha.ml.oneHotEncoding(f.totality, c.features)
"""
)
# First, let's project our graph into the GDS Graph Catalog
# We will use a native projection to begin with
G_animals, projection_result = gds.graph.project(
"wiki_animals",
["Chameleon", "Squirrel", "Crocodile"],
{"LINK": {"orientation": "UNDIRECTED"}},
nodeProperties=["features_one_hot", "target"],
)
print(projection_result[["graphName", "nodeCount", "relationshipCount"]])
4. Connectivity
In graph analytics, one typically operates on connected graphs, that is, graphs consisting of a single component. The reason is that, in most cases, information does not flow where there are no connections.
The fastest way to determine the number of components in our graph is to use the WCC (Weakly Connected Components) algorithm.
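If all we needed were the component count, and no per-node property, WCC's stats mode would be enough. A sketch, assuming the G_animals graph object projected above:
# stats mode only returns summary figures such as componentCount;
# it does not write anything back to the in-memory graph
print(gds.wcc.stats(G_animals)["componentCount"])
Below we use mutate mode instead, because the next section needs each node's component id for subgraph filtering.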
# We use the WCC algorithm to see how many components we have
wcc_result = gds.wcc.mutate(G_animals, mutateProperty="wcc_component")
print(wcc_result[["computeMillis", "componentCount"]])
5. Component separation
Now that we know that our graph consists of three components, we will proceed to separate them into disjoint graphs. We will use the subgraph projection to do this, creating one subgraph for each component.
# First, we stream the component ids
components = gds.graph.nodeProperty.stream(G_animals, "wcc_component")
# Second, we compute the unique component ids
component_ids = components["propertyValue"].unique()
# Third, we project a subgraph for each component
component_graphs = [
gds.graph.filter(
f"animals_component_{component_id}",
G_animals,
f"n.wcc_component = {component_id}",
"*",
)[0]
for component_id in component_ids
]
# Lastly, we map the node labels in the graphs to the graph
graph_components_by_labels = {str(G_component.node_labels()): G_component for G_component in component_graphs}
print({k: v.name() for k, v in graph_components_by_labels.items()})
# Now, we are only interested in the Chameleon graph,
# so we will drop the other graphs and define a better variable for the one we keep
graph_components_by_labels[str(["Crocodile"])].drop()
graph_components_by_labels[str(["Squirrel"])].drop()
G_chameleon = graph_components_by_labels[str(["Chameleon"])]
# With the graph object G_chameleon, we can inspect some statistics
print("#nodes: " + str(G_chameleon.node_count()))
print("#relationships: " + str(G_chameleon.relationship_count()))
print("Degree distribution")
print("=" * 25)
print(G_chameleon.degree_distribution().sort_index())
6. Now, let's construct a training pipeline!
We will create a node regression pipeline, and then:
- Configure the splitting
- Add model candidates
- Configure auto-tuning
- Add node property steps
- Select model features
The pipeline lives in the pipeline catalog, and we operate on it through a pipeline object, for maximum convenience.
# Now, let's construct a training pipeline!
chameleons_nr_training = gds.nr_pipe("node_regression_pipeline__Chameleons")
# We configure the splitting
chameleons_nr_training.configureSplit(validationFolds=5, testFraction=0.2)
# We add a set of model candidates
# A linear regression model with the learningRate parameter in a search space
chameleons_nr_training.addLinearRegression(
penalty=1e-5,
patience=3,
tolerance=1e-5,
minEpochs=20,
maxEpochs=500,
learningRate={"range": [100, 1000]}, # We let the auto-tuner find a good value
)
# Let's try a few different models
chameleons_nr_training.configureAutoTuning(maxTrials=10)
# Our input feature dimension is 3132
# We can reduce the dimension to speed up training using a FastRP node embedding
chameleons_nr_training.addNodeProperty(
"fastRP",
embeddingDimension=256,
propertyRatio=0.8,
featureProperties=["features_one_hot"],
mutateProperty="frp_embedding",
randomSeed=420,
)
# And finally we select what features the model should be using
# We rely on the FastRP embedding solely, because it encapsulates the one-hot encoded source features
chameleons_nr_training.selectFeatures("frp_embedding")
# The training pipeline is now fully configured and ready to be run!
# We use the training pipeline to train a model
nc_model, train_result = chameleons_nr_training.train(
G_chameleon, # First, we use the entire Chameleon graph
modelName="chameleon_nr_model",
targetNodeLabels=["Chameleon"],
targetProperty="target",
metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
randomSeed=420,
)
print("Winning model parameters: \n\t\t" + str(train_result["modelInfo"]["bestParameters"]))
print()
print("MEAN_SQUARED_ERROR test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"]))
print("MEAN_ABSOLUTE_ERROR test score: " + str(train_result["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"]))
# Let's sample the graph to see if we can get a similarly good model
G_chameleon_sample, _ = gds.alpha.graph.sample.rwr(
"cham_sample",
G_chameleon,
samplingRatio=0.30, # We'll use 30% of the graph
)
# Now we can use the same training pipeline to train another model, but faster!
nc_model_sample, train_result_sample = chameleons_nr_training.train(
G_chameleon_sample,
modelName="chameleon_nr_model_sample",
targetNodeLabels=["Chameleon"],
targetProperty="target",
metrics=["MEAN_SQUARED_ERROR", "MEAN_ABSOLUTE_ERROR"],
randomSeed=420,
)
print("Winning model parameters: \n\t\t" + str(train_result_sample["modelInfo"]["bestParameters"]))
print()
print(
"MEAN_SQUARED_ERROR test score: "
+ str(train_result_sample["modelInfo"]["metrics"]["MEAN_SQUARED_ERROR"]["test"])
)
print(
"MEAN_ABSOLUTE_ERROR test score: "
+ str(train_result_sample["modelInfo"]["metrics"]["MEAN_ABSOLUTE_ERROR"]["test"])
)
# Let's see what our models predict
# The speed-trained model on 24% training data (30% sample - 20% test set)
predicted_targets_sample = nc_model_sample.predict_stream(G_chameleon)
# The fully trained model on 80% training data (20% test set)
predicted_targets_full = nc_model.predict_stream(G_chameleon)
# The original training data for comparison
real_targets = gds.graph.nodeProperty.stream(G_chameleon, "target")
# Merging the data frames
merged_full = real_targets.merge(predicted_targets_full, left_on="nodeId", right_on="nodeId")
merged_all = merged_full.merge(predicted_targets_sample, left_on="nodeId", right_on="nodeId")
# Look at the last 10 rows
print(merged_all.tail(10))
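To put a number on how close the sample-trained model comes to the fully trained one, we can compare their mean absolute errors over all nodes. This is a sketch: the predictedValue_x / predictedValue_y column names assume pandas' default suffixing, since both merged frames carry a predictedValue column.
# Columns assumed from pandas' default "_x"/"_y" merge suffixes:
# predictedValue_x = fully trained model, predictedValue_y = sample-trained model
mae_full = (merged_all["propertyValue"] - merged_all["predictedValue_x"]).abs().mean()
mae_sample = (merged_all["propertyValue"] - merged_all["predictedValue_y"]).abs().mean()
print("MAE of fully trained model: " + str(mae_full))
print("MAE of sample-trained model: " + str(mae_sample))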