This repository has been archived by the owner on Oct 18, 2021. It is now read-only.

Merge pull request #68 from Nicole00/algo
add nebula-algorithm module
darionyaphet authored May 8, 2021
2 parents 9c2267a + e07fb0a commit 6be142e
Showing 38 changed files with 3,146 additions and 0 deletions.
72 changes: 72 additions & 0 deletions nebula-algorithm/README.md
# Welcome to nebula-algorithm

nebula-algorithm is a Spark application based on [GraphX](https://spark.apache.org/graphx/) that provides the following graph algorithms:


| Algorithm |Description|Use Cases|
|:------------------------:|:-----------:|:----:|
| PageRank | page ranking | web page ranking, key node mining|
| Louvain | community detection | community mining, hierarchical clustering|
| KCore | k-core |community detection, financial risk control|
| LabelPropagation | label propagation |information propagation, advertising recommendation, community detection|
| ConnectedComponent | connected component |community detection, island detection|
|StronglyConnectedComponent| strongly connected component |community detection|
| ShortestPath | shortest path |path planning, network planning|
| TriangleCount | triangle counting |network structure analysis|
| BetweennessCentrality | betweenness centrality |key node mining, node influence computation|
| DegreeStatic | degree statistics |graph structure analysis|

With `nebula-algorithm`, you can run the complete algorithm tool against data in the `Nebula Graph` database by submitting a `Spark` job, or call the algorithms under the `lib` package programmatically to run graph computation on a DataFrame.

## Build Nebula-Algorithm

```bash
$ git clone https://github.com/vesoft-inc/nebula-spark-utils.git
$ cd nebula-algorithm
$ mvn clean package -Dgpg.skip -Dmaven.javadoc.skip=true -Dmaven.test.skip=true
```
After the build completes, `nebula-algorithm-2.0.0.jar` is generated under the `nebula-algorithm/target` directory.

## Use Nebula-Algorithm

Limitation: Nebula-Algorithm does not encode string ids automatically, so when running graph algorithms the source and target vertices of every edge must be integers (the `vid_type` of the Nebula Space can be String, but the data itself must be integers).
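Because of this restriction, callers with string vertex ids must map them to integers before running any algorithm. A minimal sketch in plain Scala of one way to do that (an illustration only, not part of the nebula-algorithm API):

```scala
// Assign a dense Long index to each distinct string vertex id,
// then re-express the edges with integer ids as the algorithms require.
// Hypothetical helper, not part of nebula-algorithm.
val rawEdges = Seq(("alice", "bob"), ("bob", "carol"), ("alice", "carol"))

val idIndex: Map[String, Long] =
  rawEdges.flatMap { case (s, d) => Seq(s, d) }
    .distinct
    .zipWithIndex
    .map { case (id, i) => id -> i.toLong }
    .toMap

val encodedEdges: Seq[(Long, Long)] =
  rawEdges.map { case (s, d) => (idIndex(s), idIndex(d)) }
```

Keep the `idIndex` map around if you need to translate the algorithm's integer results back to the original string ids.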

* Option 1: submit the nebula-algorithm package directly

* Set the configuration file

See the [sample configuration](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-spark-utils/nebula-algorithm/src/main/resources/application.conf) for details on each option.

* Submit the algorithm job

```bash
${SPARK_HOME}/bin/spark-submit --master <mode> --class com.vesoft.nebula.algorithm.Main nebula-algorithm-2.0.0.jar -p property_file
```
* Option 2: call the nebula-algorithm algorithm interface

The `lib` package of `nebula-algorithm` provides 10 commonly used graph algorithms, which can be invoked programmatically.
* Add the dependency to pom.xml
```xml
<dependency>
<groupId>com.vesoft</groupId>
<artifactId>nebula-algorithm</artifactId>
<version>2.0.0</version>
</dependency>
```
* Define the algorithm parameters and invoke the algorithm (taking `PageRank` as an example)
```scala
val prConfig = new PRConfig(5, 1.0)
val prResult = PageRankAlgo.apply(spark, data, prConfig, false)
```
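For intuition, the iteration that PageRank performs can be sketched in plain Scala over an in-memory edge list, using `maxIter` and `resetProb` values like those in `PRConfig`. This is an illustration of the algorithm, not the library's GraphX-based implementation:

```scala
// Toy PageRank: each vertex's rank flows evenly along its out-edges,
// damped by resetProb (the reset/teleport probability).
val edges = Seq((1L, 2L), (2L, 3L), (3L, 1L), (1L, 3L))
val vertices = edges.flatMap { case (s, d) => Seq(s, d) }.distinct
val outDeg = edges.groupBy(_._1).map { case (v, es) => v -> es.size }

val resetProb = 0.15
val maxIter = 10

var ranks: Map[Long, Double] = vertices.map(_ -> 1.0).toMap
for (_ <- 1 to maxIter) {
  // Each source sends rank / outDegree along every out-edge.
  val contrib = edges
    .map { case (s, d) => d -> ranks(s) / outDeg(s) }
    .groupBy(_._1)
    .map { case (v, cs) => v -> cs.map(_._2).sum }
  ranks = vertices.map { v =>
    v -> (resetProb + (1 - resetProb) * contrib.getOrElse(v, 0.0))
  }.toMap
}
```

Vertex 3, which receives edges from both 1 and 2 in this toy graph, ends up with the highest rank.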

See the [test examples](https://github.com/vesoft-inc/nebula-spark-utils/tree/master/nebula-spark-utils/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib) for how to invoke the other algorithms.

*Note: in the DataFrame passed to an algorithm, by default the first column is the source vertex, the second column is the target vertex, and the third column is the edge weight.*

## Contributing

nebula-algorithm is a fully open-source project. Open-source enthusiasts are welcome to participate in the following ways:

- Join issue discussions on the [Nebula Graph forum](https://discuss.nebula-graph.com.cn/ "Nebula Graph forum"), e.g. answer questions, share ideas, or report problems you cannot solve
- Write or improve documentation
- Submit code improvements
176 changes: 176 additions & 0 deletions nebula-algorithm/pom.xml
<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>nebula-spark</artifactId>
<groupId>com.vesoft</groupId>
<version>2.0.0</version>
<relativePath>../pom.xml</relativePath>
</parent>
<modelVersion>4.0.0</modelVersion>

<artifactId>nebula-algorithm</artifactId>

<properties>
<spark.version>2.4.4</spark.version>
<nebula.version>2.0.0</nebula.version>
<config.version>1.4.0</config.version>
<scopt.version>3.7.1</scopt.version>
<scalatest.version>3.2.0</scalatest.version>
<junit.version>4.13.1</junit.version>
<compiler.source.version>1.8</compiler.source.version>
<compiler.target.version>1.8</compiler.target.version>
</properties>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>com.vesoft</groupId>
<artifactId>nebula-spark-connector</artifactId>
<version>${nebula.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe</groupId>
<artifactId>config</artifactId>
<version>${config.version}</version>
</dependency>
<dependency>
<groupId>com.github.scopt</groupId>
<artifactId>scopt_2.11</artifactId>
<version>${scopt.version}</version>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>${scalatest.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
</dependencies>


<build>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<!-- maven-jar -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>

<!-- maven-compiler -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>${compiler.source.version}</source>
<target>${compiler.target.version}</target>
</configuration>
</plugin>

<!-- maven-shade -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>false</createDependencyReducedPom>
<artifactSet>
<excludes>
<exclude>org.apache.spark:*</exclude>
<exclude>org.apache.hadoop:*</exclude>
<exclude>org.apache.hive:*</exclude>
<exclude>log4j:log4j</exclude>
<exclude>org.apache.orc:*</exclude>
<exclude>xml-apis:xml-apis</exclude>
<exclude>javax.inject:javax.inject</exclude>
<exclude>org.spark-project.hive:hive-exec</exclude>
<exclude>stax:stax-api</exclude>
<exclude>org.glassfish.hk2.external:aopalliance-repackaged</exclude>
</excludes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>com/vesoft/tools/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>

<!-- scala-maven -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>

<!-- scala-test -->
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>2.0.0</version>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>

</plugins>
</build>

</project>
126 changes: 126 additions & 0 deletions nebula-algorithm/src/main/resources/application.conf
{
# Spark relation config
spark: {
app: {
name: LPA
# spark.app.partitionNum
partitionNum:100
}
master:local
}

data: {
# data source. Optional values: nebula, csv, json
source: csv
# data sink: the algorithm result will be written into this sink. Optional values: nebula, csv, text
sink: nebula
# whether the algorithm uses edge weights
hasWeight: false
}

# Nebula Graph relation config
nebula: {
# algo's data source from Nebula. If data.source is nebula, this nebula.read config takes effect.
read: {
# Nebula metad server address; multiple addresses are separated by commas
metaAddress: "127.0.0.1:9559"
# Nebula space
space: nb
# Nebula edge types; multiple labels mean that data from multiple edge types will be unioned together
labels: ["serve"]
# Nebula edge property name for each edge type; this property is used as the weight column for the algorithm.
# Make sure the weightCols correspond to the labels.
weightCols: ["start_year"]
}

# algo result sink into Nebula. If data.sink is nebula, this nebula.write config takes effect.
write:{
# Nebula graphd server address; multiple addresses are separated by commas
graphAddress: "127.0.0.1:9669"
# Nebula metad server address; multiple addresses are separated by commas
metaAddress: "127.0.0.1:9559,127.0.0.1:9560"
user:root
pswd:nebula
# Nebula space name
space:nb
# Nebula tag name; the algorithm result will be written into this tag
tag:pagerank
}
}

local: {
# algo's data source from a local file. If data.source is csv or json, this local.read config takes effect.
read:{
filePath: "hdfs://127.0.0.1:9000/edge/work_for.csv"
# srcId column
srcId:"_c0"
# dstId column
dstId:"_c1"
# weight column
#weight: "col3"
# whether the csv file has a header
header: false
# csv file's delimiter
delimiter:","
}

# algo result sink into a local file. If data.sink is csv or text, this local.write config takes effect.
write:{
resultPath:/tmp/
}
}


algorithm: {
# the algorithm that you are going to execute; pick one from [pagerank, louvain, connectedcomponent,
# labelpropagation, shortestpaths, degreestatic, kcore, stronglyconnectedcomponent, trianglecount,
# betweenness]
executeAlgo: pagerank

# pagerank parameter
pagerank: {
maxIter: 10
resetProb: 0.15 # default 0.15
}

# louvain parameter
louvain: {
maxIter: 20
internalIter: 10
tol: 0.5
}

# connected component parameter (TODO: not implemented yet)
connectedcomponent: {
maxIter: 20
}

# LabelPropagation
labelpropagation: {
maxIter: 20
}

# ShortestPaths
shortestpaths: {
# landmark vertex ids; shortest paths are computed from every vertex to these landmarks.
landmarks: "1"
}

# vertex degree statistics
degreestatic: {}

# kcore
kcore:{
maxIter:10
degree:1
}

# trianglecount
trianglecount:{}

# betweenness centrality
betweenness:{
maxIter:5
}
}
}
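The `algorithm.executeAlgo` key above decides which of the per-algorithm parameter blocks is consulted. A minimal dispatch sketch in plain Scala (hypothetical names mirroring the config; the actual Main wiring may differ):

```scala
// Hypothetical dispatch keyed on algorithm.executeAlgo: the chosen name
// selects which parameter block's values are read. Defaults mirror the
// sample application.conf above.
case class AlgoParams(maxIter: Int, resetProb: Option[Double])

def paramsFor(executeAlgo: String): AlgoParams = executeAlgo match {
  case "pagerank"         => AlgoParams(maxIter = 10, resetProb = Some(0.15))
  case "labelpropagation" => AlgoParams(maxIter = 20, resetProb = None)
  case "kcore"            => AlgoParams(maxIter = 10, resetProb = None)
  case other              => sys.error(s"unknown algorithm: $other")
}
```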
10 changes: 10 additions & 0 deletions nebula-algorithm/src/main/resources/edge
{"src":12345,"dst":23456,"degree":34, "descr": "aaa","timep": "2020-01-01"}
{"src":11111,"dst":22222,"degree":33, "descr": "aaa","timep": "2020-01-01"}
{"src":11111,"dst":33333,"degree":32, "descr": "a\baa","timep": "2020-01-01"}
{"src":11111,"dst":44444,"degree":31, "descr": "aaa","timep": "2020-01-01"}
{"src":22222,"dst":55555,"degree":30, "descr": "a\naa","timep": "2020-01-01"}
{"src":33333,"dst":44444,"degree":29, "descr": "aaa","timep": "2020-01-01"}
{"src":33333,"dst":55555,"degree":28, "descr": "aa\ta","timep": "2020-01-01"}
{"src":44444,"dst":22222,"degree":27, "descr": "aaa","timep": "2020-01-01"}
{"src":44444,"dst":55555,"degree":26, "descr": "aaa","timep": "2020-01-01"}
{"src":22222,"dst":66666,"degree":25, "descr": "aaa","timep": "2020-01-01"}