diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
new file mode 100644
index 0000000..abc225a
--- /dev/null
+++ b/.github/workflows/main.yaml
@@ -0,0 +1,101 @@
+# Builds the prep-docs and chat container images and pushes them to the GitHub
+# Container Registry (ghcr.io). Image tags are computed from git tags with MinVer.
+
+name: build
+
+on:
+  push:
+    branches:
+      - main
+      - aihub
+    tags:
+      - v*
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+# GITHUB_TOKEN must be able to push packages to ghcr.io; keep the rest read-only.
+permissions:
+  contents: read
+  packages: write
+
+env:
+  REGISTRY: ghcr.io
+  DOCS_IMAGE_NAME: aihub-prepdocs
+  CHAT_IMAGE_NAME: aihub-chat
+  AIHUB_IMAGE_NAME: aihub
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          # MinVer derives the version from tags and commit height, so the
+          # full history is needed; the default shallow clone hides it.
+          fetch-depth: 0
+
+      - name: Set up .NET Core
+        uses: actions/setup-dotnet@v4
+        with:
+          dotnet-version: '8.x'
+
+      - name: Setup MinVer
+        run: |
+          dotnet tool install --global minver-cli --version 4.3.0
+
+      # NOTE(review): the MinVer tag prefix is 'v.' while this workflow triggers
+      # on 'v*' tags - confirm release tags really look like 'v.1.2.3'.
+      - name: Calculate Version
+        run: |
+          echo "MINVERVERSIONOVERRIDE=$($HOME/.dotnet/tools/minver -t v. -m 1.0 -d preview)" >> "$GITHUB_ENV"
+
+      # Container image references must be lower case.
+      - name: Lower case REPO
+        run: |
+          echo "GITHUB_REPOSITORY_LOWER_CASE=${GITHUB_REPOSITORY,,}" >> "$GITHUB_ENV"
+
+      # Nothing is pushed for pull requests, so no registry login is needed there.
+      - name: Login to the Container registry
+        if: github.event_name != 'pull_request'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Extract metadata (tags, labels) for prep-docs Docker
+        id: meta-docs
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.GITHUB_REPOSITORY_LOWER_CASE }}/${{ env.DOCS_IMAGE_NAME }}
+
+      - name: Build and push prep-docs Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./scripts/
+          file: ./scripts/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ env.REGISTRY }}/${{ env.GITHUB_REPOSITORY_LOWER_CASE }}/${{ env.DOCS_IMAGE_NAME }}:${{ env.MINVERVERSIONOVERRIDE }}
+          labels: ${{ steps.meta-docs.outputs.labels }}
+
+      - name: Extract metadata (tags, labels) for chat Docker
+        id: meta-chat
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.GITHUB_REPOSITORY_LOWER_CASE }}/${{ env.CHAT_IMAGE_NAME }}
+
+      - name: Build and push chat Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: ./azure-search-openai-demo/app
+          file: ./azure-search-openai-demo/app/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ env.REGISTRY }}/${{ env.GITHUB_REPOSITORY_LOWER_CASE }}/${{ env.CHAT_IMAGE_NAME }}:${{ env.MINVERVERSIONOVERRIDE }}
+          labels: ${{ steps.meta-chat.outputs.labels }}
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..34dc17f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,4 @@
+[submodule "azure-search-openai-demo"]
+	path = azure-search-openai-demo
+	url = https://github.com/cmendible/azure-search-openai-demo.git
+	branch = k8s
diff --git a/azure-search-openai-demo b/azure-search-openai-demo
new file mode 160000
index 0000000..70f770d
--- /dev/null
+++ b/azure-search-openai-demo
@@ -0,0 +1 @@
+Subproject
commit 70f770d1f3c6c37ee683fbc0797cbc7ef5d20865 diff --git a/infra/.terraform.lock.hcl b/infra/.terraform.lock.hcl index 3efc788..a4456cd 100644 --- a/infra/.terraform.lock.hcl +++ b/infra/.terraform.lock.hcl @@ -2,97 +2,97 @@ # Manual edits may be lost in future updates. provider "registry.terraform.io/azure/azapi" { - version = "1.9.0" + version = "1.11.0" hashes = [ - "h1:yIJQVdnmGZdvS3yrw0M8ke9KiB/c0tjZ7KUXC46Hjx0=", - "zh:349569471fbf387feaaf8b88da1690669e201147c342f905e5eb03df42b3cf87", - "zh:54346d5fb78cbad3eb7cfd96e1dd7ce4f78666cabaaccfec6ee9437476330018", - "zh:64b799da915ea3a9a58ac7a926c6a31c59fd0d911687804d8e815eda88c5580b", - "zh:9336ed9e112555e0fda8af6be9ba21478e30117d79ba662233311d9560d2b7c6", - "zh:a8aace9897b28ea0b2dbd7a3be3df033e158af40412c9c7670be0956f216ed7e", - "zh:ab23df7de700d9e785009a4ca9ceb38ae1ab894a13f5788847f15d018556f415", - "zh:b4f13f0b13560a67d427c71c85246f8920f98987120341830071df4535842053", - "zh:e58377bf36d8a14d28178a002657865ee17446182dac03525fd43435e41a1b5c", - "zh:ea5db4acc6413fd0fe6b35981e58cdc9850f5f3118031cc3d2581de511aee6aa", - "zh:f0b32c06c6bd4e4af2c02a62be07b947766aeeb09289a03f21aba16c2fd3c60f", - "zh:f1518e766a90c257d7eb36d360dafaf311593a4a9352ff8db0bcfe0ed8cf45ae", - "zh:fa89e84cff0776b5b61ff27049b1d8ed52040bd58c81c4628890d644a6fb2989", + "h1:nxSbPf052jbk91vEmlJ6JxV7AhJzyxRclLQAiDXORek=", + "zh:240ba0f3d87f8faf3171e1dd0ec74bffc868bde84db7fb2c89913c787b11ef07", + "zh:422cfbe039f6041525d55aa0641dfed014d970b516d8de058a1869736682b9d3", + "zh:4be67c64d73eb3c31706d575436179cc6f6b3dece00709e5721b60512031b2f2", + "zh:744b4f68b229c11b3df1198e4ebb4646fa44c14ac5f271337da03917d9fad433", + "zh:86927d43f75a8163c2c947fae8d48a63219865e50df437372ee66378826172a1", + "zh:a44523fad3a806b2ccee2e81ef206ddaab365eacdec213ec2cce2ddd7d4ed731", + "zh:b15c9edac6df2c250ff04f0edae18a9656d19c79c475ef68be5f5c2631059d7e", + "zh:d1365f7fe280c11cc7613b4b47798c1f96271c4bb2eed951d6a994790d0b62d4", + 
"zh:e7fea9c180f1f2be6e96152a3b4e0beada3aa585c186f1f3de6be6c74ee858fe", + "zh:e8278579b6a18e04a538a1163e257a9be65a3cc35e13a57ea868f179ca03ec28", + "zh:fc8f4eeefb44877965eae59e152d82e347261936d47d4d3c448cde5181164ae3", + "zh:fd76a6fb2819a1ce56454132c775b721c5028e4f24dd264f2897f345cd4b12ee", ] } provider "registry.terraform.io/hashicorp/azuread" { - version = "2.44.0" + version = "2.47.0" hashes = [ - "h1:oAmmXpj7Pb89vRw0ThEpaU6Aio8TmdQHKfrCFWykRDY=", - "zh:06d0061a5ba488a4bdd661117f410f0bce4b103153ddba6854c06a68cf50542b", - "zh:16ec95329d12de191eca3f75e066e9d70730fd7ff7b175fe1c61f247b9aba9e0", + "h1:zYMGokLn44KSWir7Nr4t8lEAPMB6JuXd2LlP2Ac2tMY=", + "zh:1372d81eb24ef3b4b00ea350fe87219f22da51691b8e42ce91d662f6c2a8af5e", "zh:1c3e89cf19118fc07d7b04257251fc9897e722c16e0a0df7b07fcd261f8c12e7", - "zh:82530c85481617fd2d7e2ad7c6a2318da19dd8a04566c3e39c35d69d9270f4c5", - "zh:9879fddc7740e9a68066d820f49e3941b6514cdf72b86162d033a2f9dd490340", - "zh:a152d237396cffd701ae184146f3ff9076f617aa72177e11ad76b828ab9518ca", - "zh:aef2da192bf30856f317525b2a7a02da277089a562ad5a74edf8fc059bd2ab58", - "zh:b142f05ad261afc06bf74a399294df1330c556a69ef9605fc64ae2aa8eebe8e8", - "zh:b177d35dec6dfd2b962e6f4239229645c30d1625c3cafde539a2beab312d6914", - "zh:ce2c8571e0e0bb350abb312fbfc90795663fd64d43c078477cc75031e3078c87", - "zh:e0b1577910276d7cbb11c6002a27f7918a026fd5b5c4170c28a797fe3167a3db", - "zh:f42583d31cd3a77d02fafbf8d744a37c4497ac05554f264b30908b621449731d", + "zh:1e654a74d171d6ff8f9f6f67e3ff1421d4c5e56a18607703626bf12cd23ba001", + "zh:35227fad617a0509c64ab5759a8b703b10d244877f1aa5416bfbcc100c96996f", + "zh:357f553f0d78d46a96c7b2ed06d25ee0fc60fc5be19812ccb5d969fa47d62e17", + "zh:58faa2940065137e3e87d02eba59ab5cd7137d7a18caf225e660d1788f274569", + "zh:7308eda0339620fa24f47cedd22221fc2c02cab9d5be1710c09a783aea84eb3a", + "zh:863eabf7f908a8263e28d8aa2ad1381affd6bb5c67755216781f674ef214100e", + "zh:8b95b595a7c14ed7b56194d03cdec253527e7a146c1c58961be09e6b5c50baee", + 
"zh:afbca6b4fac9a0a488bc22ff9e51a8f14e986137d25275068fd932f379a51d57", + "zh:c6aadec4c81a44c3ffc22c2d90ffc6706bf5a9a903a395d896477516f4be6cbb", + "zh:e54a59de7d4ef0f3a18f91fed0b54a2bce18257ae2ee1df8a88226e1023c5811", ] } provider "registry.terraform.io/hashicorp/azurerm" { - version = "3.72.0" - constraints = "3.72.0" + version = "3.87.0" + constraints = "3.87.0" hashes = [ - "h1:xogMjHMWY7pwV6ZB8CSmZi0YJ8PLQrE+maLnQQv5x5g=", - "zh:0750326f82dc0765cd9dc0e142b4c325be7918beeacc0b887510274f15d76311", - "zh:10ba452905de646181bfbbb9555c7b8fb96138ddc4bb42227521c402c3b12213", - "zh:25c8198603cffa0920e6ae39a87a5bb4af75bbe1fba36156e8077ae50261a7ca", - "zh:5c294fff683c2fc292f502da43f41bf4b68a20bf60a2e92723768a0ce7fe2c7a", - "zh:84449a0e7d5bd4a3fda9a4c9ad287c4c7ebcc5ede406d3ab7593f073d40abdfc", - "zh:89f3fc2b3e84e45776fce547ed9fa3dbdba65fe243094fe308c5cef273b4d980", - "zh:a8cdfc816fbf14a230c3bb4ccdf70d19069186de78008e49dc9dfaa8aaf0208e", - "zh:d6e1d86f2d6d0e09d3961f10f9e26e24a25d39e98ecaf93d5cb089ddb4fea5b6", - "zh:e74f0e6c3904da8ff10bdb90be1fd8b20f1d3f14d62d24ffb76b61c623ee0e3c", - "zh:e9fb32ef48450b8109e30e47280053ca7d5307190bf6f516e1bebaf556dc8d81", - "zh:ee8c9bb7aa318a3d8a313eb032b3fc1a332114fe112723ba7b0c8cb4a5947476", + "h1:SqFtup3wvbozutkXVF78LylpKL4nGED1cbk2IbLzhAQ=", + "zh:1547ed020fa6ca25d940b28601442c7e4495fdea9fb1ead7affb867383f5f40b", + "zh:325e6d636b5ab09a24837194647617c9fabd42f0fb2c7e18ae8d2a8b2d890a55", + "zh:3abb0074de1dc3b723f8c209354ba93e717ba52003847484b19369e8f54735f4", + "zh:52d2b1700108d5093113a986aa10757834e48083c1358a2c7d8d0360e2689390", + "zh:5fe377d5cc80e26766ff411dbcb227728709fe34b14ad106c9e374df653086a4", + "zh:747fe80de4fb88b17cac93ff05d62909d3563325c8ed5a461641b48579c328f8", + "zh:b40142e4041b7f000ab2dda58309755395a4018d5d00218f6a601f737389865a", + "zh:bca622818c221cec81d636879e376c15696a8d091703298195d728b3c1eae7db", + "zh:bfaecd203137ff9eb3228b1cbd3191e1d84d7c019855eb5f3071bbf6eb060a51", + 
"zh:d197f04b54f2be07f827ced220954d723039c84793a4ce91894b622982c25811", + "zh:e831601ea1f67c5e745946ed3ac0cac772ed8e95ca7d7314d3f0ed631e6eefb1", "zh:f569b65999264a9416862bca5cd2a6177d94ccb0424f3a4ef424428912b9cb3c", ] } provider "registry.terraform.io/hashicorp/null" { - version = "3.2.1" + version = "3.2.2" hashes = [ - "h1:vUW21lLLsKlxtBf0QF7LKJreKxs0CM7YXGzqW1N/ODY=", - "zh:58ed64389620cc7b82f01332e27723856422820cfd302e304b5f6c3436fb9840", - "zh:62a5cc82c3b2ddef7ef3a6f2fedb7b9b3deff4ab7b414938b08e51d6e8be87cb", - "zh:63cff4de03af983175a7e37e52d4bd89d990be256b16b5c7f919aff5ad485aa5", - "zh:74cb22c6700e48486b7cabefa10b33b801dfcab56f1a6ac9b6624531f3d36ea3", + "h1:m467k2tZ9cdFFgHW7LPBK2GLPH43LC6wc3ppxr8yvoE=", + "zh:3248aae6a2198f3ec8394218d05bd5e42be59f43a3a7c0b71c66ec0df08b69e7", + "zh:32b1aaa1c3013d33c245493f4a65465eab9436b454d250102729321a44c8ab9a", + "zh:38eff7e470acb48f66380a73a5c7cdd76cc9b9c9ba9a7249c7991488abe22fe3", + "zh:4c2f1faee67af104f5f9e711c4574ff4d298afaa8a420680b0cb55d7bbc65606", + "zh:544b33b757c0b954dbb87db83a5ad921edd61f02f1dc86c6186a5ea86465b546", + "zh:696cf785090e1e8cf1587499516b0494f47413b43cb99877ad97f5d0de3dc539", + "zh:6e301f34757b5d265ae44467d95306d61bef5e41930be1365f5a8dcf80f59452", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:79e553aff77f1cfa9012a2218b8238dd672ea5e1b2924775ac9ac24d2a75c238", - "zh:a1e06ddda0b5ac48f7e7c7d59e1ab5a4073bbcf876c73c0299e4610ed53859dc", - "zh:c37a97090f1a82222925d45d84483b2aa702ef7ab66532af6cbcfb567818b970", - "zh:e4453fbebf90c53ca3323a92e7ca0f9961427d2f0ce0d2b65523cc04d5d999c2", - "zh:e80a746921946d8b6761e77305b752ad188da60688cfd2059322875d363be5f5", - "zh:fbdb892d9822ed0e4cb60f2fedbdbb556e4da0d88d3b942ae963ed6ff091e48f", - "zh:fca01a623d90d0cad0843102f9b8b9fe0d3ff8244593bd817f126582b52dd694", + "zh:913a929070c819e59e94bb37a2a253c228f83921136ff4a7aa1a178c7cce5422", + "zh:aa9015926cd152425dbf86d1abdbc74bfe0e1ba3d26b3db35051d7b9ca9f72ae", + 
"zh:bb04798b016e1e1d49bcc76d62c53b56c88c63d6f2dfe38821afef17c416a0e1", + "zh:c23084e1b23577de22603cff752e59128d83cfecc2e6819edadd8cf7a10af11e", ] } provider "registry.terraform.io/hashicorp/random" { - version = "3.5.1" + version = "3.6.0" hashes = [ - "h1:3hjTP5tQBspPcFAJlfafnWrNrKnr7J4Cp0qB9jbqf30=", - "zh:04e3fbd610cb52c1017d282531364b9c53ef72b6bc533acb2a90671957324a64", - "zh:119197103301ebaf7efb91df8f0b6e0dd31e6ff943d231af35ee1831c599188d", - "zh:4d2b219d09abf3b1bb4df93d399ed156cadd61f44ad3baf5cf2954df2fba0831", - "zh:6130bdde527587bbe2dcaa7150363e96dbc5250ea20154176d82bc69df5d4ce3", - "zh:6cc326cd4000f724d3086ee05587e7710f032f94fc9af35e96a386a1c6f2214f", + "h1:t0mRdJzegohRKhfdoQEJnv3JRISSezJRblN0HIe67vo=", + "zh:03360ed3ecd31e8c5dac9c95fe0858be50f3e9a0d0c654b5e504109c2159287d", + "zh:1c67ac51254ba2a2bb53a25e8ae7e4d076103483f55f39b426ec55e47d1fe211", + "zh:24a17bba7f6d679538ff51b3a2f378cedadede97af8a1db7dad4fd8d6d50f829", + "zh:30ffb297ffd1633175d6545d37c2217e2cef9545a6e03946e514c59c0859b77d", + "zh:454ce4b3dbc73e6775f2f6605d45cee6e16c3872a2e66a2c97993d6e5cbd7055", "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:b6d88e1d28cf2dfa24e9fdcc3efc77adcdc1c3c3b5c7ce503a423efbdd6de57b", - "zh:ba74c592622ecbcef9dc2a4d81ed321c4e44cddf7da799faa324da9bf52a22b2", - "zh:c7c5cde98fe4ef1143bd1b3ec5dc04baf0d4cc3ca2c5c7d40d17c0e9b2076865", - "zh:dac4bad52c940cd0dfc27893507c1e92393846b024c5a9db159a93c534a3da03", - "zh:de8febe2a2acd9ac454b844a4106ed295ae9520ef54dc8ed2faf29f12716b602", - "zh:eab0d0495e7e711cca367f7d4df6e322e6c562fc52151ec931176115b83ed014", + "zh:91df0a9fab329aff2ff4cf26797592eb7a3a90b4a0c04d64ce186654e0cc6e17", + "zh:aa57384b85622a9f7bfb5d4512ca88e61f22a9cea9f30febaa4c98c68ff0dc21", + "zh:c4a3e329ba786ffb6f2b694e1fd41d413a7010f3a53c20b432325a94fa71e839", + "zh:e2699bc9116447f96c53d55f2a00570f982e6f9935038c3810603572693712d0", + "zh:e747c0fd5d7684e5bfad8aa0ca441903f15ae7a98a737ff6aca24ba223207e2c", + 
"zh:f1ca75f417ce490368f047b63ec09fd003711ae48487fba90b4aba2ccf71920e", ] } diff --git a/infra/README.md b/infra/README.md index 366638a..8b2d9f9 100644 --- a/infra/README.md +++ b/infra/README.md @@ -12,39 +12,4 @@ terraform apply ## Manual steps -> This is temporal - -Clone the GitHub repository [cmendible/azure-search-openai-demo](https://github.com/cmendible/azure-search-openai-demo) and run the following commands to deploy the Azure Search Index and upload the sample documents: - -```bash -git clone https://github.com/cmendible/azure-search-openai-demo.git -cd azure-search-openai-demo -git checkout k8s - -export AZURE_PRINCIPAL_ID="" -export AZURE_RESOURCE_GROUP="" -export AZURE_SUBSCRIPTION_ID="" -export AZURE_TENANT_ID="" -export AZURE_STORAGE_ACCOUNT="" -export AZURE_STORAGE_CONTAINER="content" -export AZURE_SEARCH_SERVICE="" -export OPENAI_HOST="azure" -export AZURE_OPENAI_SERVICE="" -export OPENAI_API_KEY="" -export OPENAI_ORGANIZATION="" -export AZURE_OPENAI_EMB_DEPLOYMENT="text-embedding-ada-002" -export AZURE_OPENAI_EMB_MODEL_NAME="text-embedding-ada-002" -export AZURE_SEARCH_INDEX="gptkbindex" -``` - -Login to Azure: - -```bash -azd auth login --client-id --client-secret --tenant-id -``` - -Deploy the Azure Search Index and upload the sample documents: - -```bash -./scripts/prepdocs.sh -``` \ No newline at end of file +TODO: Describe these steps diff --git a/infra/main.tf b/infra/main.tf index d40e71a..3d26b1e 100644 --- a/infra/main.tf +++ b/infra/main.tf @@ -4,14 +4,30 @@ resource "random_id" "random" { byte_length = 8 } -resource "azurerm_resource_group" "rg" { - name = var.resource_group_name - location = var.location +locals { + sufix = var.use_random_suffix ? substr(lower(random_id.random.hex), 1, 4) : "" + name_sufix = var.use_random_suffix ? 
"-${local.sufix}" : "" + resource_group_name = "${var.resource_group_name}${local.name_sufix}" + storage_account_name = "${var.storage_account_name}${local.sufix}" + azopenai_name = "${var.azopenai_name}${local.name_sufix}" + content_safety_name = "${var.content_safety_name}${local.name_sufix}" + cognitive_services_name = "${var.cognitive_services_name}${local.name_sufix}" + speech_name = "${var.speech_name}${local.name_sufix}" + bing_name = "${var.bing_name}${local.name_sufix}" + search_name = "${var.search_name}${local.name_sufix}" + form_recognizer_name = "${var.form_recognizer_name}${local.name_sufix}" + apim_name = "${var.apim_name}${local.name_sufix}" + appi_name = "${var.appi_name}${local.name_sufix}" + log_name = "${var.log_name}${local.name_sufix}" + cae_name = "${var.cae_name}${local.name_sufix}" + ca_chat_name = "${var.ca_chat_name}${local.name_sufix}" + ca_prep_docs_name = "${var.ca_prep_docs_name}${local.name_sufix}" + ca_aihub_name = "${var.ca_aihub_name}${local.name_sufix}" } -locals { - name_sufix = substr(lower(random_id.random.hex), 1, 4) - storage_account_name = "${var.storage_account_name}${local.name_sufix}" +resource "azurerm_resource_group" "rg" { + name = local.resource_group_name + location = var.location } module "vnet" { @@ -37,7 +53,7 @@ module "apim" { source = "./modules/apim" location = azurerm_resource_group.rg.location resource_group_name = azurerm_resource_group.rg.name - apim_name = var.apim_name + apim_name = local.apim_name apim_subnet_id = module.vnet.apim_subnet_id publisher_name = var.publisher_name publisher_email = var.publisher_email @@ -68,7 +84,7 @@ module "search" { source = "./modules/search" location = azurerm_resource_group.rg.location resource_group_name = azurerm_resource_group.rg.name - search_name = var.search_name + search_name = local.search_name principal_id = module.mi.principal_id } @@ -76,21 +92,21 @@ module "form_recognizer" { source = "./modules/form" location = azurerm_resource_group.rg.location 
resource_group_name = azurerm_resource_group.rg.name - form_recognizer_name = var.form_recognizer_name + form_recognizer_name = local.form_recognizer_name } module "log" { source = "./modules/log" location = azurerm_resource_group.rg.location resource_group_name = azurerm_resource_group.rg.name - log_name = var.log_name + log_name = local.log_name } module "appi" { source = "./modules/appi" location = azurerm_resource_group.rg.location resource_group_name = azurerm_resource_group.rg.name - appi_name = var.appi_name + appi_name = local.appi_name log_id = module.log.log_id } @@ -106,27 +122,38 @@ module "openai" { source = "./modules/openai" location = azurerm_resource_group.rg.location resource_group_name = azurerm_resource_group.rg.name - secondary_location = var.secondary_location - azopenai_name = var.azopenai_name + azopenai_name = local.azopenai_name principal_id = module.mi.principal_id } +module "cog" { + source = "./modules/cog" + location = azurerm_resource_group.rg.location + resource_group_name = azurerm_resource_group.rg.name + resource_group_id = azurerm_resource_group.rg.id + principal_id = module.mi.principal_id + bing_name = local.bing_name + cognitive_services_name = local.cognitive_services_name + content_safety_name = local.content_safety_name + speech_name = local.speech_name +} + module "cae" { source = "./modules/cae" location = azurerm_resource_group.rg.location resource_group_id = azurerm_resource_group.rg.id - cae_name = var.cae_name + cae_name = local.cae_name cae_subnet_id = module.vnet.cae_subnet_id log_workspace_id = module.log.log_workspace_id log_key = module.log.log_key appi_key = module.appi.appi_key } -module "ca_back" { - source = "./modules/ca-back" +module "ca_chat" { + source = "./modules/ca-chat" location = azurerm_resource_group.rg.location resource_group_id = azurerm_resource_group.rg.id - ca_name = var.ca_back_name + ca_name = local.ca_chat_name cae_id = module.cae.cae_id managed_identity_id = module.mi.mi_id 
chat_gpt_deployment = module.openai.gpt_deployment_name @@ -137,42 +164,57 @@ module "ca_back" { storage_container_name = module.st.storage_container_name search_service_name = module.search.search_service_name search_index_name = module.search.search_index_name - openai_service_name = var.enable_apim ? module.apim.gateway_url : module.openai.openai_endpoint + openai_endpoint = var.enable_apim ? module.apim.gateway_url : module.openai.openai_endpoint tenant_id = data.azurerm_subscription.current.tenant_id managed_identity_client_id = module.mi.client_id enable_entra_id_authentication = var.enable_entra_id_authentication } -# module "ca_webapi" { -# source = "./modules/ca-webapi" -# location = azurerm_resource_group.rg.location -# resource_group_id = azurerm_resource_group.rg.id -# ca_name = var.ca_webapi_name -# cae_id = module.cae.cae_id -# cae_default_domain = module.cae.defaultDomain -# ca_webapp_name = var.ca_webapp_name -# managed_identity_id = module.mi.mi_id -# chat_gpt_deployment = module.openai.gpt_deployment_name -# chat_gpt_model = module.openai.gpt_deployment_name -# embeddings_deployment = module.openai.embedding_deployment_name -# embeddings_model = module.openai.embedding_deployment_name -# storage_account_name = module.st.storage_account_name -# storage_container_name = module.st.storage_container_name -# search_service_name = module.search.search_service_name -# search_index_name = module.search.search_index_name -# openai_service_name = module.openai.openai_service_name -# tenant_id = data.azurerm_subscription.current.tenant_id -# managed_identity_client_id = module.mi.client_id -# } - -# module "ca_webapp" { -# source = "./modules/ca-webapp" -# location = azurerm_resource_group.rg.location -# resource_group_id = azurerm_resource_group.rg.id -# ca_name = var.ca_webapp_name -# cae_id = module.cae.cae_id -# managed_identity_id = module.mi.mi_id -# tenant_id = data.azurerm_subscription.current.tenant_id -# managed_identity_client_id = 
module.mi.client_id -# backend_url = module.ca_webapi.fqdn -# } +module "ca_prep_docs" { + source = "./modules/ca-prep-docs" + location = azurerm_resource_group.rg.location + resource_group_id = azurerm_resource_group.rg.id + ca_name = local.ca_prep_docs_name + cae_id = module.cae.cae_id + managed_identity_id = module.mi.mi_id + storage_account_name = module.st.storage_account_name + storage_account_key = module.st.key + search_service_name = module.search.search_service_name + tenant_id = data.azurerm_subscription.current.tenant_id + managed_identity_client_id = module.mi.client_id + openai_service_name = module.openai.openai_service_name + resource_group_name = azurerm_resource_group.rg.name + subscription_id = data.azurerm_subscription.current.subscription_id +} + +module "ca_aihub" { + source = "./modules/ca-aihub" + location = azurerm_resource_group.rg.location + resource_group_id = azurerm_resource_group.rg.id + ca_name = local.ca_aihub_name + cae_id = module.cae.cae_id + managed_identity_id = module.mi.mi_id + chat_gpt_deployment = module.openai.gpt_deployment_name + chat_gpt_model = module.openai.gpt_deployment_name + embeddings_deployment = module.openai.embedding_deployment_name + embeddings_model = module.openai.embedding_deployment_name + storage_account_name = module.st.storage_account_name + storage_container_name = module.st.storage_container_name + search_service_name = module.search.search_service_name + search_index_name = module.search.search_index_name + openai_endpoint = var.enable_apim ? 
module.apim.gateway_url : module.openai.openai_endpoint + chat_fqdn = module.ca_chat.fqdn + pbi_report_link = var.pbi_report_link + content_safety_endpoint = module.cog.content_safety_endpoint + content_safety_key = module.cog.content_safety_key + cognitive_service_endpoint = module.cog.cognitive_service_endpoint + cognitive_service_key = module.cog.cognitive_service_key + speech_key = module.cog.speech_key + + storage_connection_string = module.st.connection_string + bing_key = var.bing_key + + tenant_id = data.azurerm_subscription.current.tenant_id + managed_identity_client_id = module.mi.client_id + enable_entra_id_authentication = var.enable_entra_id_authentication +} diff --git a/infra/modules/apim/main.tf b/infra/modules/apim/main.tf index d283524..7f09a66 100644 --- a/infra/modules/apim/main.tf +++ b/infra/modules/apim/main.tf @@ -3,6 +3,18 @@ locals { backend_url = "${var.openai_service_endpoint}openai" } +resource "azurerm_public_ip" "apim_public_ip" { + count = var.enable_apim ? 1 : 0 + name = "pip-apim" + location = var.location + resource_group_name = var.resource_group_name + allocation_method = "Static" + sku = "Standard" + ip_tags = {} + zones = ["1", "2", "3"] + domain_name_label = var.apim_name +} + resource "azurerm_api_management" "apim" { count = var.enable_apim ? 
1 : 0 name = var.apim_name @@ -12,6 +24,7 @@ resource "azurerm_api_management" "apim" { publisher_email = var.publisher_email sku_name = "Developer_1" virtual_network_type = "External" # Use "Internal" for a fully private APIM + public_ip_address_id = azurerm_public_ip.apim_public_ip[0].id // Required to deploy APIM in STv2 platform virtual_network_configuration { subnet_id = var.apim_subnet_id @@ -31,7 +44,6 @@ resource "azurerm_api_management_backend" "openai" { } } -// TODO: https://learn.microsoft.com/en-us/azure/api-management/api-management-howto-log-event-hubs?tabs=bicep#logger-with-system-assigned-managed-identity-credentialss resource "azurerm_api_management_logger" "appi_logger" { count = var.enable_apim ? 1 : 0 name = local.logger_name diff --git a/infra/modules/ca-back/auth_config.tf b/infra/modules/ca-aihub/auth_config.tf similarity index 93% rename from infra/modules/ca-back/auth_config.tf rename to infra/modules/ca-aihub/auth_config.tf index d5fee17..f487b6f 100644 --- a/infra/modules/ca-back/auth_config.tf +++ b/infra/modules/ca-aihub/auth_config.tf @@ -62,7 +62,6 @@ resource "null_resource" "update_redirect_uris" { azapi_resource.current ] triggers = { - input_json = md5(local.update_redirect_uris_command) - update_redirect_uris_command = local.update_redirect_uris_command + always_run = timestamp() } } diff --git a/infra/modules/ca-aihub/main.tf b/infra/modules/ca-aihub/main.tf new file mode 100644 index 0000000..5159ea8 --- /dev/null +++ b/infra/modules/ca-aihub/main.tf @@ -0,0 +1,208 @@ +resource "azapi_resource" "ca_back" { + name = var.ca_name + location = var.location + parent_id = var.resource_group_id + type = "Microsoft.App/containerApps@2022-11-01-preview" + identity { + type = "UserAssigned" + identity_ids = [ + var.managed_identity_id + ] + } + + body = jsonencode({ + properties : { + managedEnvironmentId = "${var.cae_id}" + configuration = { + secrets = [ + { + name = "microsoft-provider-authentication-secret" + value = 
"${var.enable_entra_id_authentication ? module.sp[0].password : "None"}" + }, + { + name = "content-safety-key" + value = "${var.content_safety_key}" + }, + { + name = "cognitive-service-key", + value = "${var.cognitive_service_key}" + }, + { + name = "speech-key", + value = "${var.speech_key}" + }, + { + name = "storage-connection-string" + value = "${var.storage_connection_string}" + }, + { + name = "bing-key" + value = "${var.bing_key}" + } + ] + ingress = { + external = true + targetPort = 8080 + transport = "Http" + + traffic = [ + { + latestRevision = true + weight = 100 + } + ] + } + dapr = { + enabled = false + } + } + template = { + containers = [ + { + name = "aihub" + image = "ghcr.io/azure/aihub/aihub:1.0.0-preview.0" + resources = { + cpu = 0.5 + memory = "1Gi" + } + env = [ + { + name = "Logging__LogLevel__Default", + value = "Information" + }, + { + name = "Logging__LogLevel__Microsoft.AspNetCore", + value = "Warning" + }, + { + name = "ContentModerator__Endpoint", + value = "${var.content_safety_endpoint}" + }, + { + name = "ContentModerator__SubscriptionKey", + secretRef = "content-safety-key" + }, + { + name = "BrandAnalyzer__BingEndpoint", + value = "https://api.bing.microsoft.com/v7.0/search" + }, + { + name = "BrandAnalyzer__BingKey", + secretRef = "bing-key" + }, + { + name = "BrandAnalyzer__OpenAIEndpoint", + value = "${var.openai_endpoint}" + }, + { + name = "BrandAnalyzer__OpenAISubscriptionKey", + value = "" + }, + { + name = "CallCenter__OpenAIEndpoint", + value = "${var.openai_endpoint}" + }, + { + name = "CallCenter__OpenAISubscriptionKey", + value = "" + }, + { + name = "ImageAnalyzer__VisionEndpoint", + value = "${var.cognitive_service_endpoint}computervision/imageanalysis:analyze?api-version=2023-02-01-preview&features=denseCaptions&language=en&gender-neutral-caption=False" + }, + { + name = "ImageAnalyzer__OCREndpoint", + value = 
"${var.cognitive_service_endpoint}computervision/imageanalysis:analyze?api-version=2023-02-01-preview&features=read&gender-neutral-caption=False" + }, + { + name = "ImageAnalyzer__VisionSubscriptionKey", + secretRef = "cognitive-service-key" + }, + { + name = "ImageAnalyzer__OpenAIEndpoint", + value = "${var.openai_endpoint}" + }, + { + name = "ImageAnalyzer__OpenAISubscriptionKey", + value = "" + }, + { + name = "ImageAnalyzer__ContainerName", + value = "image-analyzer" + }, + { + name = "FormAnalyzer__FormRecogEndpoint", + value = "${var.cognitive_service_endpoint}formrecognizer/documentModels/prebuilt-layout:analyze?api-version=2023-07-31" + }, + { + name = "FormAnalyzer__FormRecogSubscriptionKey", + secretRef = "cognitive-service-key" + }, + { + name = "FormAnalyzer__OpenAIEndpoint", + value = "${var.openai_endpoint}" + }, + { + name = "FormAnalyzer__OpenAISubscriptionKey", + value = "" + }, + { + name = "FormAnalyzer__ContainerName", + value = "form-analyzer" + }, + { + name = "Storage__ConnectionString", + secretRef = "storage-connection-string" + }, + { + name = "Storage__ContainerName", + value = "image-moderator" + }, + { + name = "AudioTranscription__SpeechLocation", + value = "${var.location}" + }, + { + name = "AudioTranscription__SpeechSubscriptionKey", + secretRef = "speech-key" + }, + { + name = "AudioTranscription__ContainerName", + value = "audio-files" + }, + { + name = "ChatOnYourData__Link", + value = "https://${var.chat_fqdn}" + }, + { + name = "PBIReport__Link", + value = "${var.pbi_report_link}" + }, + { + name = "AllowedHosts", + value = "*" + }, + { + name = "AZURE_TENANT_ID" + value = "${var.tenant_id}" + }, + { + name = "AZURE_CLIENT_ID" + value = "${var.managed_identity_client_id}" + }, + { + name = "ASPNETCORE_ENVIRONMENT" + value = "Development" + } + ], + }, + ] + scale = { + minReplicas = 1 + maxReplicas = 1 + } + } + } + }) + response_export_values = ["properties.configuration.ingress.fqdn"] +} diff --git 
a/infra/modules/ca-back/outputs.tf b/infra/modules/ca-aihub/outputs.tf similarity index 100% rename from infra/modules/ca-back/outputs.tf rename to infra/modules/ca-aihub/outputs.tf diff --git a/infra/modules/ca-webapi/providers.tf b/infra/modules/ca-aihub/providers.tf similarity index 92% rename from infra/modules/ca-webapi/providers.tf rename to infra/modules/ca-aihub/providers.tf index 30260c4..c4fa93c 100644 --- a/infra/modules/ca-webapi/providers.tf +++ b/infra/modules/ca-aihub/providers.tf @@ -3,7 +3,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.72.0" + version = "3.87.0" } azapi = { source = "Azure/azapi" diff --git a/infra/modules/ca-webapi/variables.tf b/infra/modules/ca-aihub/variables.tf similarity index 55% rename from infra/modules/ca-webapi/variables.tf rename to infra/modules/ca-aihub/variables.tf index 54e5b95..8c684cc 100644 --- a/infra/modules/ca-webapi/variables.tf +++ b/infra/modules/ca-aihub/variables.tf @@ -2,13 +2,10 @@ variable "resource_group_id" {} variable "location" {} variable "ca_name" {} variable "cae_id" {} -variable "cae_default_domain" {} variable "managed_identity_id" {} variable "managed_identity_client_id" {} variable "tenant_id" {} -variable "ca_webapp_name" {} - variable "storage_account_name" {} variable "storage_container_name" {} variable "search_service_name" {} @@ -17,4 +14,17 @@ variable "chat_gpt_deployment" {} variable "chat_gpt_model" {} variable "embeddings_deployment" {} variable "embeddings_model" {} -variable "openai_service_name" {} +variable "openai_endpoint" {} + +variable "chat_fqdn" {} +variable "pbi_report_link" {} +variable "content_safety_endpoint" {} +variable "content_safety_key" {} +variable "cognitive_service_endpoint" {} +variable "cognitive_service_key" {} +variable "speech_key" {} + +variable "storage_connection_string" {} +variable "bing_key" {} + +variable "enable_entra_id_authentication" {} diff --git a/infra/modules/ca-chat/auth_config.tf 
b/infra/modules/ca-chat/auth_config.tf new file mode 100644 index 0000000..f487b6f --- /dev/null +++ b/infra/modules/ca-chat/auth_config.tf @@ -0,0 +1,67 @@ +locals { + redirect_fqdn = jsondecode(azapi_resource.ca_back.output).properties.configuration.ingress.fqdn +} + +module "sp" { + count = var.enable_entra_id_authentication ? 1 : 0 + source = "../sp" + sp_name = var.ca_name +} + +resource "azapi_resource" "current" { + count = var.enable_entra_id_authentication ? 1 : 0 + type = "Microsoft.App/containerApps/authConfigs@2023-05-01" + name = "Current" + parent_id = azapi_resource.ca_back.id + timeouts {} + body = jsonencode({ + properties = { + platform = { + enabled = true + } + globalValidation = { + redirectToProvider = "azureactivedirectory" + unauthenticatedClientAction = "RedirectToLoginPage" + } + identityProviders = { + azureActiveDirectory = { + enabled = true + isAutoProvisioned = true + registration = { + clientId = "${module.sp[0].client_id}" + clientSecretSettingName = "microsoft-provider-authentication-secret" + openIdIssuer = "https://sts.windows.net/${var.tenant_id}/v2.0" + } + validation = { + allowedAudiences = [ + "api://${module.sp[0].client_id}" + ] + } + } + } + login = { + preserveUrlFragmentsForLogins = false + } + } + }) +} + +locals { + fqdn = jsondecode(azapi_resource.ca_back.output).properties.configuration.ingress.fqdn + update_redirect_uris_command = var.enable_entra_id_authentication ? "az ad app update --id ${module.sp[0].client_id} --web-redirect-uris https://${local.fqdn}/.auth/login/aad/callback" : "" +} + +resource "null_resource" "update_redirect_uris" { + count = var.enable_entra_id_authentication ? 
1 : 0 + provisioner "local-exec" { + command = local.update_redirect_uris_command + } + depends_on = [ + module.sp, + azapi_resource.ca_back, + azapi_resource.current + ] + triggers = { + always_run = timestamp() + } +} diff --git a/infra/modules/ca-back/main.tf b/infra/modules/ca-chat/main.tf similarity index 98% rename from infra/modules/ca-back/main.tf rename to infra/modules/ca-chat/main.tf index 65f491e..a1c251e 100644 --- a/infra/modules/ca-back/main.tf +++ b/infra/modules/ca-chat/main.tf @@ -80,7 +80,7 @@ resource "azapi_resource" "ca_back" { }, { name = "AZURE_OPENAI_SERVICE" - value = "${var.openai_service_name}" + value = "${var.openai_endpoint}" }, { name = "AZURE_TENANT_ID" diff --git a/infra/modules/ca-chat/outputs.tf b/infra/modules/ca-chat/outputs.tf new file mode 100644 index 0000000..c0e0a25 --- /dev/null +++ b/infra/modules/ca-chat/outputs.tf @@ -0,0 +1,4 @@ +output "fqdn" { + value = jsondecode(azapi_resource.ca_back.output).properties.configuration.ingress.fqdn +} + diff --git a/infra/modules/ca-webapp/providers.tf b/infra/modules/ca-chat/providers.tf similarity index 92% rename from infra/modules/ca-webapp/providers.tf rename to infra/modules/ca-chat/providers.tf index 30260c4..c4fa93c 100644 --- a/infra/modules/ca-webapp/providers.tf +++ b/infra/modules/ca-chat/providers.tf @@ -3,7 +3,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.72.0" + version = "3.87.0" } azapi = { source = "Azure/azapi" diff --git a/infra/modules/ca-back/variables.tf b/infra/modules/ca-chat/variables.tf similarity index 93% rename from infra/modules/ca-back/variables.tf rename to infra/modules/ca-chat/variables.tf index 99bfdb4..6ddb039 100644 --- a/infra/modules/ca-back/variables.tf +++ b/infra/modules/ca-chat/variables.tf @@ -14,6 +14,6 @@ variable "chat_gpt_deployment" {} variable "chat_gpt_model" {} variable "embeddings_deployment" {} variable "embeddings_model" {} -variable "openai_service_name" {} +variable 
"openai_endpoint" {} variable "enable_entra_id_authentication" {} diff --git a/infra/modules/ca-prep-docs/main.tf b/infra/modules/ca-prep-docs/main.tf new file mode 100644 index 0000000..8baf5f1 --- /dev/null +++ b/infra/modules/ca-prep-docs/main.tf @@ -0,0 +1,121 @@ +resource "azapi_resource" "ca_back" { + name = var.ca_name + location = var.location + parent_id = var.resource_group_id + type = "Microsoft.App/jobs@2023-05-01" + identity { + type = "UserAssigned" + identity_ids = [ + var.managed_identity_id + ] + } + + body = jsonencode({ + properties : { + environmentId = "${var.cae_id}" + configuration = { + manualTriggerConfig = { + parallelism = 1 + replicaCompletionCount = 1 + } + secrets = [] + triggerType = "Manual" + replicaTimeout = 3600 + replicaRetryLimit = 0 + } + template = { + containers = [ + { + name = "aihub-prepdocs" + image = "ghcr.io/azure/activate-genai/aihub-prepdocs:1.0.0-preview.0" + resources = { + cpu = 0.5 + memory = "1Gi" + } + env = [ + { + name = "OPENAI_HOST" + value = "azure" + }, + { + name = "AZURE_OPENAI_EMB_DEPLOYMENT" + value = "text-embedding-ada-002" + }, + { + name = "AZURE_OPENAI_EMB_MODEL_NAME" + value = "text-embedding-ada-002" + }, + { + name = "AZURE_STORAGE_CONTAINER" + value = "content" + }, + { + name = "AZURE_SEARCH_INDEX" + value = "gptkbindex" + }, + { + name = "OPENAI_API_KEY" + value = "" + }, + { + name = "OPENAI_ORGANIZATION" + value = "" + }, + { + name = "AZURE_RESOURCE_GROUP" + value = "${var.resource_group_name}" + }, + { + name = "AZURE_SUBSCRIPTION_ID" + value = "${var.subscription_id}" + }, + { + name = "AZURE_STORAGE_ACCOUNT" + value = "${var.storage_account_name}" + }, + { + name = "AZURE_SEARCH_SERVICE" + value = "${var.search_service_name}" + }, + { + name = "AZURE_OPENAI_SERVICE" + value = "${var.openai_service_name}" + }, + { + name = "AZURE_TENANT_ID" + value = "${var.tenant_id}" + }, + { + name = "AZURE_CLIENT_ID" + value = "${var.managed_identity_client_id}" + }, + ], + volumeMounts = [ + { + 
volumeName = "staging-volume" + mountPath = "/data" + } + ] + }, + ] + volumes = [ + { + name = "staging-volume" + storageName = "${azurerm_container_app_environment_storage.data.name}" + storageType = "AzureFile" + } + ] + } + } + }) + response_export_values = ["properties.configuration.ingress.fqdn"] +} + +resource "azurerm_container_app_environment_storage" "data" { + name = "stagingstorage" + container_app_environment_id = var.cae_id + account_name = var.storage_account_name + share_name = "staging" + access_key = var.storage_account_key + access_mode = "ReadWrite" +} diff --git a/infra/modules/ca-back/providers.tf b/infra/modules/ca-prep-docs/providers.tf similarity index 92% rename from infra/modules/ca-back/providers.tf rename to infra/modules/ca-prep-docs/providers.tf index 30260c4..c4fa93c 100644 --- a/infra/modules/ca-back/providers.tf +++ b/infra/modules/ca-prep-docs/providers.tf @@ -3,7 +3,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.72.0" + version = "3.87.0" } azapi = { source = "Azure/azapi" diff --git a/infra/modules/ca-prep-docs/variables.tf b/infra/modules/ca-prep-docs/variables.tf new file mode 100644 index 0000000..7f3f075 --- /dev/null +++ b/infra/modules/ca-prep-docs/variables.tf @@ -0,0 +1,13 @@ +variable "resource_group_name" {} +variable "resource_group_id" {} +variable "subscription_id" {} +variable "location" {} +variable "ca_name" {} +variable "cae_id" {} +variable "managed_identity_id" {} +variable "managed_identity_client_id" {} +variable "tenant_id" {} +variable "storage_account_name" {} +variable "storage_account_key" {} +variable "search_service_name" {} +variable "openai_service_name" {} diff --git a/infra/modules/ca-webapi/main.tf b/infra/modules/ca-webapi/main.tf deleted file mode 100644 index 0386e29..0000000 --- a/infra/modules/ca-webapi/main.tf +++ /dev/null @@ -1,102 +0,0 @@ -resource "azapi_resource" "ca_webapi" { - name = var.ca_name - location = var.location - parent_id = 
var.resource_group_id - type = "Microsoft.App/containerApps@2022-11-01-preview" - identity { - type = "UserAssigned" - identity_ids = [ - var.managed_identity_id - ] - } - - body = jsonencode({ - properties : { - managedEnvironmentId = "${var.cae_id}" - configuration = { - secrets = [] - ingress = { - external = true - targetPort = 8080 - transport = "Http" - - traffic = [ - { - latestRevision = true - weight = 100 - } - ] - corsPolicy = { - allowedOrigins = [ - "https://${var.ca_webapp_name}.${var.cae_default_domain}" - ] - allowedHeaders = ["*"] - allowCredentials = false - } - } - dapr = { - enabled = false - } - } - template = { - containers = [ - { - name = "chat-copilot-webapi" - image = "cmendibl3/chat-copilot-webapi:0.1.0" - resources = { - cpu = 0.5 - memory = "1Gi" - } - env = [ - { - name = "Authentication__Type" - value = "None" - }, - { - name = "Planner__Model" - value = "${var.chat_gpt_model}" - }, - { - name = "SemanticMemory__Services__AzureOpenAIText__Endpoint" - value = "https://${var.openai_service_name}.openai.azure.com/" - }, - { - name = "SemanticMemory__Services__AzureOpenAIText__Deployment" - value = "${var.chat_gpt_model}" - }, - { - name = "SemanticMemory__Services__AzureOpenAIText__Auth" - value = "AzureIdentity" - }, - { - name = "SemanticMemory__Services__AzureOpenAIEmbedding__Endpoint" - value = "https://${var.openai_service_name}.openai.azure.com/" - }, - { - name = "SemanticMemory__Services__AzureOpenAIEmbedding__Deployment" - value = "${var.embeddings_model}" - }, - { - name = "SemanticMemory__Services__AzureOpenAIEmbedding__Auth" - value = "AzureIdentity" - }, - { - name = "AZURE_TENANT_ID" - value = "${var.tenant_id}" - }, - { - name = "AZURE_CLIENT_ID" - value = "${var.managed_identity_client_id}" - }, - ], - }, - ] - scale = { - minReplicas = 1 - maxReplicas = 1 - } - } - } - }) - response_export_values = ["properties.configuration.ingress.fqdn"] -} diff --git a/infra/modules/ca-webapi/outputs.tf 
b/infra/modules/ca-webapi/outputs.tf deleted file mode 100644 index 6266cdb..0000000 --- a/infra/modules/ca-webapi/outputs.tf +++ /dev/null @@ -1,4 +0,0 @@ -output "fqdn" { - value = jsondecode(azapi_resource.ca_webapi.output).properties.configuration.ingress.fqdn -} - diff --git a/infra/modules/ca-webapp/main.tf b/infra/modules/ca-webapp/main.tf deleted file mode 100644 index 52c491d..0000000 --- a/infra/modules/ca-webapp/main.tf +++ /dev/null @@ -1,59 +0,0 @@ -resource "azapi_resource" "ca_webapp" { - name = var.ca_name - location = var.location - parent_id = var.resource_group_id - type = "Microsoft.App/containerApps@2022-11-01-preview" - identity { - type = "UserAssigned" - identity_ids = [ - var.managed_identity_id - ] - } - - body = jsonencode({ - properties : { - managedEnvironmentId = "${var.cae_id}" - configuration = { - secrets = [] - ingress = { - external = true - targetPort = 3000 - transport = "Http" - - traffic = [ - { - latestRevision = true - weight = 100 - } - ] - } - dapr = { - enabled = false - } - } - template = { - containers = [ - { - name = "chat-copilot-webapp" - image = "cmendibl3/chat-copilot-webapp:0.1.0" - resources = { - cpu = 0.5 - memory = "1Gi" - } - env = [ - { - name = "REACT_APP_BACKEND_URI" - value = "${var.backend_url}" - }, - ], - }, - ] - scale = { - minReplicas = 1 - maxReplicas = 1 - } - } - } - }) - response_export_values = ["properties.configuration.ingress.fqdn"] -} diff --git a/infra/modules/ca-webapp/outputs.tf b/infra/modules/ca-webapp/outputs.tf deleted file mode 100644 index 83b5bbb..0000000 --- a/infra/modules/ca-webapp/outputs.tf +++ /dev/null @@ -1,3 +0,0 @@ -output "fqdn" { - value = jsondecode(azapi_resource.ca_webapp.output).properties.configuration.ingress.fqdn -} diff --git a/infra/modules/ca-webapp/variables.tf b/infra/modules/ca-webapp/variables.tf deleted file mode 100644 index d2c65e3..0000000 --- a/infra/modules/ca-webapp/variables.tf +++ /dev/null @@ -1,9 +0,0 @@ -variable "resource_group_id" {} 
-variable "location" {} -variable "ca_name" {} -variable "cae_id" {} -variable "managed_identity_id" {} -variable "managed_identity_client_id" {} -variable "tenant_id" {} - -variable "backend_url" {} \ No newline at end of file diff --git a/infra/modules/cae/providers.tf b/infra/modules/cae/providers.tf index 30260c4..c4fa93c 100644 --- a/infra/modules/cae/providers.tf +++ b/infra/modules/cae/providers.tf @@ -3,7 +3,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.72.0" + version = "3.87.0" } azapi = { source = "Azure/azapi" diff --git a/infra/modules/cog/main.tf b/infra/modules/cog/main.tf new file mode 100644 index 0000000..14b283d --- /dev/null +++ b/infra/modules/cog/main.tf @@ -0,0 +1,56 @@ +resource "azurerm_cognitive_account" "content_safety" { + name = var.content_safety_name + kind = "ContentSafety" + sku_name = "S0" + location = var.location + resource_group_name = var.resource_group_name + public_network_access_enabled = true + custom_subdomain_name = var.content_safety_name +} + +resource "azurerm_cognitive_account" "cognitive" { + name = var.cognitive_services_name + kind = "CognitiveServices" + sku_name = "S0" + location = var.location + resource_group_name = var.resource_group_name + public_network_access_enabled = true + custom_subdomain_name = var.cognitive_services_name +} + +resource "azurerm_cognitive_account" "speech" { + name = var.speech_name + kind = "SpeechServices" + sku_name = "S0" + location = var.location + resource_group_name = var.resource_group_name + public_network_access_enabled = true + custom_subdomain_name = var.speech_name +} + +resource "azapi_resource" "bing" { + name = var.bing_name + location = "global" + parent_id = var.resource_group_id + type = "Microsoft.Bing/accounts@2020-06-10" + schema_validation_enabled = false // Required for this service otherwise it will fail. 
+ + body = jsonencode({ + kind = "Bing.Search.v7" + sku = { + name = "S1" + } + properties : { + statisticsEnabled = true + } + }) + response_export_values = ["properties.endpoint"] +} + + + +# resource "azurerm_role_assignment" "openai_user" { +# scope = azurerm_cognitive_account.openai.id +# role_definition_name = "Cognitive Services OpenAI User" +# principal_id = var.principal_id +# } diff --git a/infra/modules/cog/outputs.tf b/infra/modules/cog/outputs.tf new file mode 100644 index 0000000..5103240 --- /dev/null +++ b/infra/modules/cog/outputs.tf @@ -0,0 +1,19 @@ +output "content_safety_endpoint" { + value = azurerm_cognitive_account.content_safety.endpoint +} + +output "content_safety_key" { + value = azurerm_cognitive_account.content_safety.primary_access_key +} + +output "cognitive_service_endpoint" { + value = azurerm_cognitive_account.cognitive.endpoint +} + +output "cognitive_service_key" { + value = azurerm_cognitive_account.cognitive.primary_access_key +} + +output "speech_key" { + value = azurerm_cognitive_account.speech.primary_access_key +} diff --git a/infra/modules/cog/providers.tf b/infra/modules/cog/providers.tf new file mode 100644 index 0000000..c4fa93c --- /dev/null +++ b/infra/modules/cog/providers.tf @@ -0,0 +1,20 @@ +terraform { + required_version = ">= 1.1.8" + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = "3.87.0" + } + azapi = { + source = "Azure/azapi" + } + } +} + +provider "azurerm" { + features { + cognitive_account { + purge_soft_delete_on_destroy = true + } + } +} \ No newline at end of file diff --git a/infra/modules/cog/variables.tf b/infra/modules/cog/variables.tf new file mode 100644 index 0000000..4f81339 --- /dev/null +++ b/infra/modules/cog/variables.tf @@ -0,0 +1,8 @@ +variable "resource_group_name" {} +variable "resource_group_id" {} +variable "location" {} +variable "principal_id" {} +variable "content_safety_name" {} +variable "cognitive_services_name" {} +variable "speech_name" {} 
+variable "bing_name" {} diff --git a/infra/modules/nsg/nsg_apim.tf b/infra/modules/nsg/nsg_apim.tf index cc3481d..9c72085 100644 --- a/infra/modules/nsg/nsg_apim.tf +++ b/infra/modules/nsg/nsg_apim.tf @@ -3,13 +3,26 @@ resource "azurerm_network_security_group" "nsg_apim" { location = var.location resource_group_name = var.resource_group_name + # External Only security_rule { - name = "management-endpoint" + name = "allowanyhttpsinbound" priority = 100 direction = "Inbound" access = "Allow" protocol = "Tcp" source_port_range = "*" + destination_port_range = "443" + source_address_prefix = "Internet" + destination_address_prefix = "VirtualNetwork" + } + + security_rule { + name = "management-endpoint" + priority = 200 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" destination_port_range = "3443" source_address_prefix = "ApiManagement" destination_address_prefix = "VirtualNetwork" @@ -17,7 +30,7 @@ resource "azurerm_network_security_group" "nsg_apim" { security_rule { name = "load-balancer" - priority = 200 + priority = 210 direction = "Inbound" access = "Allow" protocol = "Tcp" @@ -28,14 +41,14 @@ resource "azurerm_network_security_group" "nsg_apim" { } security_rule { - name = "allowanyhttpsinbound" - priority = 310 + name = "traffic-manager" + priority = 220 direction = "Inbound" access = "Allow" protocol = "Tcp" source_port_range = "*" destination_port_range = "443" - source_address_prefix = "*" + source_address_prefix = "AzureTrafficManager" destination_address_prefix = "VirtualNetwork" } @@ -74,6 +87,18 @@ resource "azurerm_network_security_group" "nsg_apim" { source_address_prefix = "VirtualNetwork" destination_address_prefix = "AzureKeyVault" } + + security_rule { + name = "publish-diagnostcs" + priority = 400 + direction = "Outbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_ranges = ["1886", "443"] + source_address_prefix = "VirtualNetwork" + destination_address_prefix = 
"AzureMonitor" + } } resource "azurerm_subnet_network_security_group_association" "nsg_apim_association" { diff --git a/infra/modules/openai/main.tf b/infra/modules/openai/main.tf index f26a5a0..862ca12 100644 --- a/infra/modules/openai/main.tf +++ b/infra/modules/openai/main.tf @@ -24,6 +24,22 @@ resource "azurerm_cognitive_deployment" "gpt_35_turbo" { } } +resource "azurerm_cognitive_deployment" "demo_build" { + name = "DemoBuild" + cognitive_account_id = azurerm_cognitive_account.openai.id + rai_policy_name = "Microsoft.Default" + model { + format = "OpenAI" + name = "gpt-35-turbo" + version = "0301" + } + + scale { + type = "Standard" + capacity = 120 + } +} + resource "azurerm_cognitive_deployment" "embedding" { name = "text-embedding-ada-002" cognitive_account_id = azurerm_cognitive_account.openai.id @@ -46,44 +62,3 @@ resource "azurerm_role_assignment" "openai_user" { principal_id = var.principal_id } -# resource "azurerm_cognitive_account" "secondary_openai" { -# name = var.azopenai_name -# kind = "OpenAI" -# sku_name = "S0" -# location = var.secondary_location -# resource_group_name = var.resource_group_name -# public_network_access_enabled = true -# custom_subdomain_name = var.azopenai_name -# } - -# resource "azurerm_cognitive_deployment" "secondary_gpt_35_turbo" { -# name = "gpt-35-turbo" -# cognitive_account_id = azurerm_cognitive_account.secondary_openai.id -# rai_policy_name = "Microsoft.Default" -# model { -# format = "OpenAI" -# name = "gpt-35-turbo" -# version = "0301" -# } - -# scale { -# type = "Standard" -# capacity = 120 -# } -# } - -# resource "azurerm_cognitive_deployment" "secondary_embedding" { -# name = "text-embedding-ada-002" -# cognitive_account_id = azurerm_cognitive_account.secondary_openai.id -# rai_policy_name = "Microsoft.Default" -# model { -# format = "OpenAI" -# name = "text-embedding-ada-002" -# version = "2" -# } - -# scale { -# type = "Standard" -# capacity = 239 -# } -# } diff --git a/infra/modules/openai/variables.tf 
b/infra/modules/openai/variables.tf index bc34a0a..ba92b39 100644 --- a/infra/modules/openai/variables.tf +++ b/infra/modules/openai/variables.tf @@ -1,5 +1,4 @@ variable "resource_group_name" {} variable "location" {} -variable "secondary_location" {} variable "azopenai_name" {} variable "principal_id" {} diff --git a/infra/modules/search/main.tf b/infra/modules/search/main.tf index 2d2770b..19b6c7b 100644 --- a/infra/modules/search/main.tf +++ b/infra/modules/search/main.tf @@ -3,6 +3,7 @@ resource "azurerm_search_service" "search" { location = var.location resource_group_name = var.resource_group_name sku = "standard" + semantic_search_sku = "free" local_authentication_enabled = false } @@ -23,4 +24,4 @@ resource "azurerm_role_assignment" "search_service_contributor" { scope = azurerm_search_service.search.id role_definition_name = "Search Service Contributor" principal_id = var.principal_id -} \ No newline at end of file +} diff --git a/infra/modules/sp/main.tf b/infra/modules/sp/main.tf index a47f547..e833e7a 100644 --- a/infra/modules/sp/main.tf +++ b/infra/modules/sp/main.tf @@ -1,5 +1,8 @@ data "azurerm_client_config" "current" {} +resource "random_uuid" "uuid" { +} + resource "azuread_application" "sp" { display_name = var.sp_name identifier_uris = ["api://${var.sp_name}"] @@ -22,7 +25,7 @@ resource "azuread_application" "sp" { admin_consent_description = "Allow the application to access example on behalf of the signed-in user." admin_consent_display_name = "Allow the application to access example on behalf of the signed-in user." enabled = true - id = "96183846-204b-4b43-82e1-5d2222eb4b9b" + id = random_uuid.uuid.result type = "User" user_consent_description = "Allow the application to access example on your behalf." user_consent_display_name = "Allow the application to access example on behalf of the signed-in user." 
diff --git a/infra/modules/sp/outputs.tf b/infra/modules/sp/outputs.tf index 112ddc9..b6c3a43 100644 --- a/infra/modules/sp/outputs.tf +++ b/infra/modules/sp/outputs.tf @@ -13,3 +13,7 @@ output "id" { output "password" { value = azuread_service_principal_password.sp.value } + +output "redirect_uris_count" { + value = length(azuread_service_principal.sp.redirect_uris) +} diff --git a/infra/modules/st/main.tf b/infra/modules/st/main.tf index c655611..1a390c1 100644 --- a/infra/modules/st/main.tf +++ b/infra/modules/st/main.tf @@ -23,8 +23,46 @@ resource "azurerm_storage_container" "content" { storage_account_name = azurerm_storage_account.sa.name } -resource "azurerm_role_assignment" "storage_reader" { +resource "azurerm_storage_container" "audio" { + name = "audio-files" + container_access_type = "private" + storage_account_name = azurerm_storage_account.sa.name +} + +resource "azurerm_storage_container" "form-analyzer" { + name = "form-analyzer" + container_access_type = "private" + storage_account_name = azurerm_storage_account.sa.name +} + +resource "azurerm_storage_container" "image-analyzer" { + name = "image-analyzer" + container_access_type = "private" + storage_account_name = azurerm_storage_account.sa.name +} + +resource "azurerm_storage_container" "image-moderator" { + name = "image-moderator" + container_access_type = "private" + storage_account_name = azurerm_storage_account.sa.name +} + +resource "azurerm_storage_share" "share" { + name = "staging" + storage_account_name = azurerm_storage_account.sa.name + quota = 5 +} + +resource "azurerm_storage_share_file" "docs" { + for_each = fileset("${path.module}/../../../azure-search-openai-demo/data", "*") + name = each.value + storage_share_id = azurerm_storage_share.share.id + source = "${path.module}/../.../../azure-search-openai-demo/data/${each.value}" +} + +resource "azurerm_role_assignment" "storage_contributor" { scope = azurerm_storage_account.sa.id - role_definition_name = "Storage Blob Data Reader" 
+ role_definition_name = "Storage Blob Data Contributor" principal_id = var.principal_id } + diff --git a/infra/modules/st/outputs.tf b/infra/modules/st/outputs.tf index 4cc981f..a229fdc 100644 --- a/infra/modules/st/outputs.tf +++ b/infra/modules/st/outputs.tf @@ -4,4 +4,12 @@ output "storage_account_name" { output "storage_container_name" { value = azurerm_storage_container.content.name -} \ No newline at end of file +} + +output "connection_string" { + value = azurerm_storage_account.sa.primary_connection_string +} + +output "key" { + value = azurerm_storage_account.sa.primary_access_key +} diff --git a/infra/providers.tf b/infra/providers.tf index 30260c4..c4fa93c 100644 --- a/infra/providers.tf +++ b/infra/providers.tf @@ -3,7 +3,7 @@ terraform { required_providers { azurerm = { source = "hashicorp/azurerm" - version = "3.72.0" + version = "3.87.0" } azapi = { source = "Azure/azapi" diff --git a/infra/variables.tf b/infra/variables.tf index fb61ea2..7b46d19 100644 --- a/infra/variables.tf +++ b/infra/variables.tf @@ -3,11 +3,7 @@ variable "resource_group_name" { } variable "location" { - default = "West Europe" -} - -variable "secondary_location" { - default = "North Europe" + default = "westeurope" } variable "log_name" { @@ -18,6 +14,22 @@ variable "azopenai_name" { default = "cog-openai-activate-genai" } +variable "content_safety_name" { + default = "cog-content-safety-activate-genai" +} + +variable "cognitive_services_name" { + default = "cog-cognitive-activate-genai" +} + +variable "speech_name" { + default = "cog-speech-activate-genai" +} + +variable "bing_name" { + default = "cog-bing-activate-genai" +} + variable "search_name" { default = "srch-activate-genai" } @@ -30,10 +42,6 @@ variable "storage_account_name" { default = "stgenai" } -variable "eventhub_name" { - default = "evh-activate-genai" -} - variable "apim_name" { default = "apim-activate-genai" } @@ -45,6 +53,7 @@ variable "appi_name" { variable "publisher_name" { default = "contoso" } + 
variable "publisher_email" { default = "admin@contoso.com" } @@ -61,22 +70,34 @@ variable "cae_name" { default = "cae-activate-genai" } -variable "ca_back_name" { - default = "ca-back-activate-genai" +variable "ca_chat_name" { + default = "ca-chat-activate-genai" +} + +variable "ca_prep_docs_name" { + default = "ca-prep-docs-activate-genai" } -variable "ca_webapi_name" { - default = "ca-webapi-activate-genai" +variable "ca_aihub_name" { + default = "ca-aihub-activate-genai" } -variable "ca_webapp_name" { - default = "ca-webapp-activate-genai" +variable "use_random_suffix" { + default = true } variable "enable_entra_id_authentication" { - default = false + default = true } variable "enable_apim" { - default = false + default = true +} + +variable "bing_key" { + default = "" +} + +variable "pbi_report_link" { + default = "" } diff --git a/scripts/Dockerfile b/scripts/Dockerfile new file mode 100644 index 0000000..48ca968 --- /dev/null +++ b/scripts/Dockerfile @@ -0,0 +1,11 @@ +FROM mcr.microsoft.com/devcontainers/python:3.10 + +RUN python3 -m venv ./scripts/.venv + +COPY ./ ./scripts/ + +RUN chmod +x ./scripts/prepdocs.sh + +RUN ./scripts/.venv/bin/python -m pip install -r ./scripts/requirements.txt + +ENTRYPOINT ["./scripts/prepdocs.sh"] diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py new file mode 100644 index 0000000..ad76beb --- /dev/null +++ b/scripts/prepdocs.py @@ -0,0 +1,638 @@ +import argparse +import base64 +import glob +import html +import io +import os +import re +import time + +import openai +import tiktoken +from azure.ai.formrecognizer import DocumentAnalysisClient +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexClient +from azure.search.documents.indexes.models import ( + HnswParameters, + PrioritizedFields, + SearchableField, + SearchField, + SearchFieldDataType, + SearchIndex, + 
SemanticConfiguration, + SemanticField, + SemanticSettings, + SimpleField, + VectorSearch, + VectorSearchAlgorithmConfiguration, +) +from azure.storage.blob import BlobServiceClient +from pypdf import PdfReader, PdfWriter +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_random_exponential, +) + +args = argparse.Namespace(verbose=True, openaihost="azure") + +MAX_SECTION_LENGTH = 1000 +SENTENCE_SEARCH_LIMIT = 100 +SECTION_OVERLAP = 100 + +open_ai_token_cache = {} +CACHE_KEY_TOKEN_CRED = "openai_token_cred" +CACHE_KEY_CREATED_TIME = "created_time" +CACHE_KEY_TOKEN_TYPE = "token_type" + +# Embedding batch support section +SUPPORTED_BATCH_AOAI_MODEL = {"text-embedding-ada-002": {"token_limit": 8100, "max_batch_size": 16}} + + +def calculate_tokens_emb_aoai(input: str): + encoding = tiktoken.encoding_for_model(args.openaimodelname) + return len(encoding.encode(input)) + + +def blob_name_from_file_page(filename, page=0): + if os.path.splitext(filename)[1].lower() == ".pdf": + return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf" + else: + return os.path.basename(filename) + + +def upload_blobs(filename): + blob_service = BlobServiceClient( + account_url=f"https://{args.storageaccount}.blob.core.windows.net", credential=storage_creds + ) + blob_container = blob_service.get_container_client(args.container) + if not blob_container.exists(): + blob_container.create_container() + + # if file is PDF split into pages and upload each page as a separate blob + if os.path.splitext(filename)[1].lower() == ".pdf": + reader = PdfReader(filename) + pages = reader.pages + for i in range(len(pages)): + blob_name = blob_name_from_file_page(filename, i) + if args.verbose: + print(f"\tUploading blob for page {i} -> {blob_name}") + f = io.BytesIO() + writer = PdfWriter() + writer.add_page(pages[i]) + writer.write(f) + f.seek(0) + blob_container.upload_blob(blob_name, f, overwrite=True) + else: + blob_name = 
blob_name_from_file_page(filename) + with open(filename, "rb") as data: + blob_container.upload_blob(blob_name, data, overwrite=True) + + +def remove_blobs(filename): + if args.verbose: + print(f"Removing blobs for '{filename or ''}'") + blob_service = BlobServiceClient( + account_url=f"https://{args.storageaccount}.blob.core.windows.net", credential=storage_creds + ) + blob_container = blob_service.get_container_client(args.container) + if blob_container.exists(): + if filename is None: + blobs = blob_container.list_blob_names() + else: + prefix = os.path.splitext(os.path.basename(filename))[0] + blobs = filter( + lambda b: re.match(f"{prefix}-\d+\.pdf", b), + blob_container.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0]), + ) + for b in blobs: + if args.verbose: + print(f"\tRemoving blob {b}") + blob_container.delete_blob(b) + + +def table_to_html(table): + table_html = "" + rows = [ + sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) + for i in range(table.row_count) + ] + for row_cells in rows: + table_html += "" + for cell in row_cells: + tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td" + cell_spans = "" + if cell.column_span > 1: + cell_spans += f" colSpan={cell.column_span}" + if cell.row_span > 1: + cell_spans += f" rowSpan={cell.row_span}" + table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}" + table_html += "" + table_html += "
" + return table_html + + +def get_document_text(filename): + offset = 0 + page_map = [] + if args.localpdfparser: + reader = PdfReader(filename) + pages = reader.pages + for page_num, p in enumerate(pages): + page_text = p.extract_text() + page_map.append((page_num, offset, page_text)) + offset += len(page_text) + else: + if args.verbose: + print(f"Extracting text from '{filename}' using Azure Form Recognizer") + form_recognizer_client = DocumentAnalysisClient( + endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/", + credential=formrecognizer_creds, + headers={"x-ms-useragent": "azure-search-chat-demo/1.0.0"}, + ) + with open(filename, "rb") as f: + poller = form_recognizer_client.begin_analyze_document("prebuilt-layout", document=f) + form_recognizer_results = poller.result() + + for page_num, page in enumerate(form_recognizer_results.pages): + tables_on_page = [ + table + for table in form_recognizer_results.tables + if table.bounding_regions[0].page_number == page_num + 1 + ] + + # mark all positions of the table spans in the page + page_offset = page.spans[0].offset + page_length = page.spans[0].length + table_chars = [-1] * page_length + for table_id, table in enumerate(tables_on_page): + for span in table.spans: + # replace all table spans with "table_id" in table_chars array + for i in range(span.length): + idx = span.offset - page_offset + i + if idx >= 0 and idx < page_length: + table_chars[idx] = table_id + + # build page text by replacing characters in table spans with table html + page_text = "" + added_tables = set() + for idx, table_id in enumerate(table_chars): + if table_id == -1: + page_text += form_recognizer_results.content[page_offset + idx] + elif table_id not in added_tables: + page_text += table_to_html(tables_on_page[table_id]) + added_tables.add(table_id) + + page_text += " " + page_map.append((page_num, offset, page_text)) + offset += len(page_text) + + return page_map + + +def split_text(page_map, filename): + 
SENTENCE_ENDINGS = [".", "!", "?"] + WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"] + if args.verbose: + print(f"Splitting '{filename}' into sections") + + def find_page(offset): + num_pages = len(page_map) + for i in range(num_pages - 1): + if offset >= page_map[i][1] and offset < page_map[i + 1][1]: + return i + return num_pages - 1 + + all_text = "".join(p[2] for p in page_map) + length = len(all_text) + start = 0 + end = length + while start + SECTION_OVERLAP < length: + last_word = -1 + end = start + MAX_SECTION_LENGTH + + if end > length: + end = length + else: + # Try to find the end of the sentence + while ( + end < length + and (end - start - MAX_SECTION_LENGTH) < SENTENCE_SEARCH_LIMIT + and all_text[end] not in SENTENCE_ENDINGS + ): + if all_text[end] in WORDS_BREAKS: + last_word = end + end += 1 + if end < length and all_text[end] not in SENTENCE_ENDINGS and last_word > 0: + end = last_word # Fall back to at least keeping a whole word + if end < length: + end += 1 + + # Try to find the start of the sentence or at least a whole word boundary + last_word = -1 + while ( + start > 0 + and start > end - MAX_SECTION_LENGTH - 2 * SENTENCE_SEARCH_LIMIT + and all_text[start] not in SENTENCE_ENDINGS + ): + if all_text[start] in WORDS_BREAKS: + last_word = start + start -= 1 + if all_text[start] not in SENTENCE_ENDINGS and last_word > 0: + start = last_word + if start > 0: + start += 1 + + section_text = all_text[start:end] + yield (section_text, find_page(start)) + + last_table_start = section_text.rfind(" 2 * SENTENCE_SEARCH_LIMIT and last_table_start > section_text.rfind(" 0: + results = search_client.upload_documents(documents=batch) + succeeded = sum([1 for r in results if r.succeeded]) + if args.verbose: + print(f"\tIndexed {len(results)} sections, {succeeded} succeeded") + + +def remove_from_index(filename): + if args.verbose: + print(f"Removing sections from '{filename or ''}' from search index '{args.index}'") + search_client 
def refresh_openai_token():
    """
    Renew the Azure AD token used as `openai.api_key` once it is older than 5 minutes.

    Does nothing unless the token cache is marked as holding an "azure_ad" token.
    The age check only happens when this function is called; nothing schedules it.
    """
    if open_ai_token_cache.get(CACHE_KEY_TOKEN_TYPE) != "azure_ad":
        return
    if open_ai_token_cache[CACHE_KEY_CREATED_TIME] + 300 >= time.time():
        # Token was issued less than 300 seconds ago; keep using it.
        return
    token_cred = open_ai_token_cache[CACHE_KEY_TOKEN_CRED]
    openai.api_key = token_cred.get_token("https://cognitiveservices.azure.com/.default").token
    open_ai_token_cache[CACHE_KEY_CREATED_TIME] = time.time()


def read_files(
    path_pattern: str,
    use_vectors: bool,
    vectors_batch_support: bool,
    embedding_deployment: str = None,
    embedding_model: str = None,
):
    """
    Recursively read directory structure under `path_pattern`
    and execute indexing for the individual files

    path_pattern: glob pattern selecting the files and directories to process.
    use_vectors: whether embeddings are computed for the sections.
    vectors_batch_support: when True (and `use_vectors` is set) embeddings are added
        afterwards by `update_embeddings_in_batch` instead of inside `create_sections`.
    embedding_deployment, embedding_model: forwarded unchanged to `create_sections`.

    With `args.remove` set, each match is removed from blob storage and the index
    instead of being indexed. A failure on one file is printed and that file is
    skipped; it does not stop the run.
    """
    for filename in glob.glob(path_pattern):
        if args.verbose:
            print(f"Processing '{filename}'")

        if args.remove:
            remove_blobs(filename)
            remove_from_index(filename)
            continue

        if os.path.isdir(filename):
            # Forward the embedding settings as well; dropping them here made every
            # file in a sub-directory run with deployment/model set to None.
            read_files(
                os.path.join(filename, "*"),
                use_vectors,
                vectors_batch_support,
                embedding_deployment,
                embedding_model,
            )
            continue

        try:
            if not args.skipblobs:
                upload_blobs(filename)
            page_map = get_document_text(filename)
            sections = create_sections(
                os.path.basename(filename),
                page_map,
                use_vectors and not vectors_batch_support,
                embedding_deployment,
                embedding_model,
            )
            if use_vectors and vectors_batch_support:
                sections = update_embeddings_in_batch(sections)
            index_sections(os.path.basename(filename), sections)
        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next file.
            print(f"\tGot an error while reading {filename} -> {e} --> skipping file")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, pick a credential for each Azure
    # service (an explicit key when given, otherwise the current identity), configure
    # the OpenAI client when embeddings are wanted, then remove or index content.
    parser = argparse.ArgumentParser(
        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
        # Backslashes are written escaped: "\d" and "\*" are not valid escape sequences
        # and trigger a warning at compile time. The resulting text is unchanged.
        epilog="Example: prepdocs.py '..\\data\\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v",
    )
    parser.add_argument("files", help="Files to be processed")
    parser.add_argument(
        "--category", help="Value for the category field in the search index for all sections indexed in this run"
    )
    parser.add_argument(
        "--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage"
    )
    parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
    parser.add_argument("--container", help="Azure Blob Storage container name")
    parser.add_argument(
        "--storagekey",
        required=False,
        help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)",
    )
    # NOTE(review): --tenantid is parsed but never read in this section; confirm
    # whether the credential below is supposed to be scoped to it.
    parser.add_argument(
        "--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)"
    )
    parser.add_argument(
        "--searchservice",
        help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)",
    )
    parser.add_argument(
        "--index",
        help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)",
    )
    parser.add_argument(
        "--searchkey",
        required=False,
        help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)",
    )
    parser.add_argument("--openaihost", help="Host of the API used to compute embeddings ('azure' or 'openai')")
    parser.add_argument("--openaiservice", help="Name of the Azure OpenAI service used to compute embeddings")
    parser.add_argument(
        "--openaideployment",
        help="Name of the Azure OpenAI model deployment for an embedding model ('text-embedding-ada-002' recommended)",
    )
    parser.add_argument(
        "--openaimodelname", help="Name of the Azure OpenAI embedding model ('text-embedding-ada-002' recommended)"
    )
    parser.add_argument(
        "--novectors",
        action="store_true",
        help="Don't compute embeddings for the sections (e.g. don't call the OpenAI embeddings API during indexing)",
    )
    parser.add_argument(
        "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections"
    )
    parser.add_argument(
        "--openaikey",
        required=False,
        help="Optional. Use this Azure OpenAI account key instead of the current user identity to login (use az login to set current user for Azure). This is required only when using non-Azure endpoints.",
    )
    parser.add_argument("--openaiorg", required=False, help="This is required only when using non-Azure endpoints.")
    parser.add_argument(
        "--remove",
        action="store_true",
        help="Remove references to this document from blob storage and the search index",
    )
    parser.add_argument(
        "--removeall",
        action="store_true",
        help="Remove all blobs from blob storage and documents from the search index",
    )
    parser.add_argument(
        "--localpdfparser",
        action="store_true",
        help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents",
    )
    parser.add_argument(
        "--formrecognizerservice",
        required=False,
        help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)",
    )
    parser.add_argument(
        "--formrecognizerkey",
        required=False,
        help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)",
    )

    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them
    azd_credential = DefaultAzureCredential()
    default_creds = azd_credential if args.searchkey is None or args.storagekey is None else None
    search_creds = default_creds if args.searchkey is None else AzureKeyCredential(args.searchkey)
    use_vectors = not args.novectors
    compute_vectors_in_batch = not args.disablebatchvectors and args.openaimodelname in SUPPORTED_BATCH_AOAI_MODEL

    if not args.skipblobs:
        storage_creds = default_creds if args.storagekey is None else args.storagekey
    if not args.localpdfparser:
        # check if Azure Form Recognizer credentials are provided
        if args.formrecognizerservice is None:
            print(
                "Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser."
            )
            # SystemExit directly, rather than the `exit` helper that only exists
            # when the `site` module has been loaded.
            raise SystemExit(1)
        # Fall back to the identity credential itself, not `default_creds`: that one is
        # None whenever both --searchkey and --storagekey are given, which would leave
        # Form Recognizer without any credential.
        formrecognizer_creds = (
            azd_credential if args.formrecognizerkey is None else AzureKeyCredential(args.formrecognizerkey)
        )

    if use_vectors:
        if args.openaihost == "azure":
            if not args.openaikey:
                # No key supplied: authenticate with an Azure AD token and remember when
                # it was issued and which credential can renew it.
                openai.api_key = azd_credential.get_token("https://cognitiveservices.azure.com/.default").token
                openai.api_type = "azure_ad"
                open_ai_token_cache[CACHE_KEY_CREATED_TIME] = time.time()
                open_ai_token_cache[CACHE_KEY_TOKEN_CRED] = azd_credential
                open_ai_token_cache[CACHE_KEY_TOKEN_TYPE] = "azure_ad"
            else:
                openai.api_key = args.openaikey
                openai.api_type = "azure"
            openai.api_base = f"https://{args.openaiservice}.openai.azure.com"
            openai.api_version = "2023-05-15"
        else:
            print("using normal openai")
            openai.api_key = args.openaikey
            openai.organization = args.openaiorg
            openai.api_type = "openai"

    if args.removeall:
        remove_blobs(None)
        remove_from_index(None)
    else:
        if not args.remove:
            create_search_index()

        print("Processing files...")
        read_files(args.files, use_vectors, compute_vectors_in_batch, args.openaideployment, args.openaimodelname)
b/scripts/requirements.txt @@ -0,0 +1,8 @@ +pypdf==3.9.0 +azure-identity==1.13.0 +azure-search-documents==11.4.0b6 +azure-ai-formrecognizer==3.2.1 +azure-storage-blob==12.14.1 +openai[datalib]==0.27.8 +tiktoken==0.4.0 +tenacity==8.2.2 \ No newline at end of file