How to build raw interactions table using Data API
This document describes the process of reconstructing the Raw Interactions table using data retrieved from the Data API.
The Data API provides raw data in a normalized format, split across multiple related tables. To recreate the unified Raw Interactions view, these tables must be joined and transformed to align with the structure and semantics of the original dataset.
In this exercise, we download the raw data into an Excel file and perform the necessary transformations within Excel to build the final view.
Step - 1 : Fetching and storing data into CSV files.
The Data API distributes raw data across five key tables: Conversations, Interactions, Plugin Calls, Plugin Resources, and Users.
We use a Python script to fetch data from the past 24 hours and generate an Excel file containing all five tables.
You can copy the script into your environment and execute it. Be sure to replace the placeholder for the API key with your actual Data API credentials before running the script.

Python script
import requests
import pandas as pd
import time
import logging
from pathlib import Path
import re
# Config
BASE_API = 'https://api.moveworks.ai/export/v1beta2/records'
ENDPOINTS = [
'interactions',
'conversations',
'plugin-calls',
'plugin-resources',
'users'
]
ACCESS_TOKEN = '<Access Token>' # Replace this
# Filter time window (applied to all endpoints)
TIME_FILTER = "last_updated_time ge '2025-07-15T00:00:00.000Z' and last_updated_time le '2025-07-16T00:00:00.000Z'"
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Headers
headers = {
'Authorization': f'Bearer {ACCESS_TOKEN}',
}
# Retry config
MAX_RETRIES = 5
# Function to clean illegal characters
def clean_illegal_chars(df):
def remove_illegal_chars(value):
if isinstance(value, str):
return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', value)
return value
return df.applymap(remove_illegal_chars)
# Flatten nested fields
def flatten_record(record):
flat = record.copy()
if isinstance(flat.get("detail"), dict):
detail = flat.pop("detail", {}) or {}
for key, value in detail.items():
if isinstance(value, list):
value = ','.join(map(str, value))
flat[f'detail_{key}'] = value
return flat
# Collect data per endpoint
all_dataframes = {}
def fetch_data(endpoint):
url = f'{BASE_API}/{endpoint}'
params = {
'$orderby': 'id desc',
'$filter': TIME_FILTER,
}
data = []
retries = 0
page_count = 0
logging.info(f"Fetching data from /{endpoint}")
while url:
try:
logging.info(f"Requesting page {page_count + 1}: {url}")
response = requests.get(url, headers=headers, params=params if url.endswith(endpoint) else None)
if response.status_code == 200:
json_resp = response.json()
page_data = json_resp.get('value', [])
logging.info(f"Retrieved {len(page_data)} records from page {page_count + 1}")
if not page_data:
break
for record in page_data:
data.append(flatten_record(record))
url = json_resp.get('@odata.nextLink')
retries = 0
page_count += 1
if endpoint == 'users':
time.sleep(2)
elif response.status_code == 429:
wait = 90 if endpoint == 'users' else 60
logging.warning(f"Rate limited on /{endpoint}. Waiting for {wait} seconds.")
time.sleep(wait)
elif response.status_code in (500, 502, 503, 504):
if retries < MAX_RETRIES:
wait = 2 ** retries
logging.warning(f"Transient error {response.status_code} on /{endpoint}. Retrying in {wait} seconds...")
time.sleep(wait)
retries += 1
else:
logging.error(f"Max retries reached on /{endpoint}. Aborting.")
break
else:
logging.error(f"Unexpected error {response.status_code} on /{endpoint}: {response.text}")
break
except Exception as e:
logging.exception(f"Exception occurred while calling /{endpoint}")
break
if data:
df = pd.DataFrame(data)
df = clean_illegal_chars(df)
all_dataframes[endpoint] = df
logging.info(f"Ingestion complete for /{endpoint}, {len(df)} records.")
else:
logging.info(f"No data retrieved from /{endpoint}.")
# Run ingestion for all endpoints
for endpoint in ENDPOINTS:
fetch_data(endpoint)
# Save to Excel with one sheet per endpoint
if all_dataframes:
output_file = Path("moveworks_export.xlsx")
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
for endpoint, df in all_dataframes.items():
sheet_name = endpoint[:31] # Excel sheet name max limit
df.to_excel(writer, sheet_name=sheet_name, index=False)
logging.info(f"All data written to {output_file}")
else:
logging.warning("No data to write to Excel.")

Step - 2 : Modification of the excel file to generate the raw interactions view.
Let’s learn the sequence of columns shown in raw interactions table and understand where this data is stored in Data API
Raw interactions table | Definition | Data API table |
---|---|---|
Timestamp | This the created time for user interaction | Interactions |
Conversation ID | Conversation ID attributed to an interaction stored | Interactions |
Conversation Domain | This is detected domain on a conversation level | Conversations |
Conversation Topic | All topics aggregated for interactions based on the conversation ID | Interactions |
Interaction Type | Type of interaction | Interactions |
Interaction content | User message or action information | Interactions |
Bot Response | Summarized response provided by the AI Assistant | Interactions |
Unsuccessful Plugins | Plugins called by the AI Assistant but were not served and not used | Plugin calls |
Plugin Served | Plugins that the reasoner considered and served to the user | Plugin calls |
Plugin Used | Plugins that the user interacted or engaged with | Plugin calls |
Resource Domain | Resource domain configured during setting up the ingestion. | Plugin resource |
No of citations | Total no of citations provided in the summarized response | Plugin resource |
Content Item name | Name of the content item used to generate the summarized response | Plugin resource |
Content Item ID | External resource ID of the content item | Plugin resource |
Ticket type | Indicates who filed the ticket
| Plugin resource |
Ticket ID | Unique identifier for a ticket | Plugin resource |
Interaction surface | Platform where the interaction took place | Interactions |
User Department | Department to which the end user belongs to | |
User Location | Location to which the end user belongs to | |
User country | Country to which the end user belongs to | |
User Preferred language | Default language configured in the AI Assistant | Users |
Now that we have identified the columns required to reconstruct the Raw Interactions view, let’s begin working through the raw data and progressively merge the interactions and conversations tables to form a unified view. Please note that we will be using Excel functions throughout this process.
-
Bring in the base columns — Actor, Conversation ID, Created Time, Interaction ID, Interaction Type
a. From the interactions sheet, extract the relevant details into a new sheet. We are only interested in user-led interactions.
Use the following formula to filter this data:
=FILTER(interactions!A:D, interactions!A:A="user")
-
Using the Conversation ID, fetch Conversation Domain and Conversation Topic
a. To fetch the Conversation Domain, use the
VLOOKUP
function to pull data from the conversations sheet:=VLOOKUP(B4,conversations!B:E,3,FALSE)
b. To recreate the Conversation Topic, aggregate all topics detected at the interaction level and replicate them for each interaction.
Use this formula:
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER(interactions!P:P, interactions!B:B= B2)))
-
Fetch additional content fields from the Interactions table
a. To extract Interaction Content, pull values from
detail_content
,detail_detail
, or similar columns, and combine them based on data availability. Use the following nestedIF
structure:=IF(INDEX(interactions!K:K, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!K:K, MATCH(D1294, interactions!D:D, 0)), IF(INDEX(interactions!N:N, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!N:N, MATCH(D1294, interactions!D:D, 0)), IF(INDEX(interactions!L:L, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!L:L, MATCH(D1294, interactions!D:D, 0)), IF(INDEX(interactions!Q:Q, MATCH(D1294, interactions!D:D, 0))<>"", INDEX(interactions!Q:Q, MATCH(D1294, interactions!D:D, 0)), ""))))
b. To retrieve the Bot Response corresponding to each user interaction, use the parent interaction ID to index into bot-led interactions:
=INDEX(interactions!K:K, MATCH(D2, interactions!G:G, 0))
-
Determine plugins involved in an interaction using the Plugin Calls table
Retrieve three plugin-related columns using the formulas below:
a. Unsuccessful plugin:
=TEXTJOIN(", ", TRUE,FILTER('plugin-calls'!F:F,('plugin-calls'!D:D=D10) *('plugin-calls'!H:H =FALSE) *('plugin-calls'!I:I=FALSE)))
b. Plugin served:
=TEXTJOIN(", ", TRUE,FILTER('plugin-calls'!F:F,('plugin-calls'!D:D=D10) *('plugin-calls'!H:H =TRUE)))
c. Plugin used:
=TEXTJOIN(", ", TRUE,FILTER('plugin-calls'!F:F,('plugin-calls'!D:D=D10) *('plugin-calls'!H:H =TRUE) *('plugin-calls'!I:I=TRUE)))
-
Add plugin resource information
Use the plugin-resources table to fetch citation and content-related metadata:
a. Resource Domain: Indicates the configured domain for knowledge or ticket-based interactions.
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!K:K, 'plugin-resources'!E:E = D2)))
b. Number of Citations: Counts distinct resources cited per interaction.
=IFERROR(COUNTA(UNIQUE(FILTER('plugin-resources'!D:D, 'plugin-resources'!E:E = D2))), 0)
Note: Even when there's a #CALC! error, Excel may return 1. Use the Content Item ID column to clean up any incorrect entries.
c. Content Item ID:
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!D:D, 'plugin-resources'!E:E = D2)))
Filter out rows returning
#CALC!
. For such rows, mark No of Citations as 1 and Content Item ID as "Nil".d. Content Item Name:
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!M:M, 'plugin-resources'!E:E = D2)))
-
Populate ticketing information (only applicable when a ticket is created)
a. Ticket Type: Indicates whether a ticket was user-initiated or bot-initiated.
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!S:S, 'plugin-resources'!E:E = D2)))
b. Ticket ID: Filter the values to include only rows where the resource type is "ticket".
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER('plugin-resources'!H:H,('plugin-resources'!E:E = D4)*('plugin-resources'!I:I="RESOURCE_TYPE_TICKET"))))
-
Fetch the platform where the interaction took place
This can be extracted from the interactions sheet based on Interaction ID:
=TEXTJOIN(", ", TRUE, UNIQUE(FILTER(interactions!H:H, interactions!D:D = D2)))
-
Get the user’s preferred language from the Users table
Use this formula to fetch preferred language based on user ID:
=TEXTJOIN(", ",TRUE,UNIQUE(FILTER(users!J:J,users!F:F=F3)))
Additional Notes:
Fields like User Department, User Location, and User Country are not directly available from the Data API. These attributes must be sourced from your organization’s external identity systems.
The User API provides an external ID
which can be used to enrich user metadata by integrating with external identity providers such as Okta. You can refer to Okta List Users for details. Once this data is pulled from the external system, join it using the external ID to populate the missing attributes.
The modifications shown here are done in excel and we are also providing a python script which can create a direct raw interactions table based on the time input provided to the script
import requests
import pandas as pd
import time
import logging
from pathlib import Path
import re
from datetime import datetime, timedelta
# Config
BASE_API = 'https://api.moveworks.ai/export/v1beta2/records'
ENDPOINTS = [
'interactions',
'conversations',
'plugin-calls',
'plugin-resources',
'users'
]
ACCESS_TOKEN = '<Access Token>' # Replace this
# Time filter will be set dynamically based on user input
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Headers
headers = {
'Authorization': f'Bearer {ACCESS_TOKEN}',
}
# Retry config
MAX_RETRIES = 5
def get_user_date_inputs():
"""Get start and end dates from user input and convert to timestamp format"""
print("\n" + "="*50)
print("RAW INTERACTIONS TABLE BUILDER")
print("="*50)
print("Enter the date range for data extraction:")
print("Date format: YYYY-MM-DD (e.g., 2025-07-15)")
print("-"*50)
while True:
try:
start_date_str = input("Enter start date (YYYY-MM-DD): ").strip()
start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
break
except ValueError:
print("❌ Invalid date format. Please use YYYY-MM-DD format (e.g., 2025-07-15)")
while True:
try:
end_date_str = input("Enter end date (YYYY-MM-DD): ").strip()
end_date = datetime.strptime(end_date_str, "%Y-%m-%d")
if end_date < start_date:
print("❌ End date cannot be earlier than start date. Please try again.")
continue
break
except ValueError:
print("❌ Invalid date format. Please use YYYY-MM-DD format (e.g., 2025-07-15)")
# Convert to timestamp format for API
start_timestamp = start_date.strftime("%Y-%m-%dT00:00:00.000Z")
# For end date, set to end of day
end_timestamp = (end_date + timedelta(days=1) - timedelta(microseconds=1)).strftime("%Y-%m-%dT23:59:59.999Z")
time_filter = f"last_updated_time ge '{start_timestamp}' and last_updated_time le '{end_timestamp}'"
print(f"\n✅ Date range set:")
print(f" Start: {start_date_str} 00:00:00 UTC")
print(f" End: {end_date_str} 23:59:59 UTC")
print(f" Filter: {time_filter}")
print("-"*50)
return time_filter, start_date_str, end_date_str
"""Clean illegal characters from DataFrame"""
def remove_illegal_chars(value):
if isinstance(value, str):
return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', value)
return value
return df.applymap(remove_illegal_chars)
def flatten_record(record):
"""Flatten nested fields in record"""
flat = record.copy()
if isinstance(flat.get("detail"), dict):
detail = flat.pop("detail", {}) or {}
for key, value in detail.items():
if isinstance(value, list):
value = ','.join(map(str, value))
flat[f'detail_{key}'] = value
return flat
def fetch_data(endpoint):
"""Fetch data from a specific endpoint"""
url = f'{BASE_API}/{endpoint}'
params = {
'$orderby': 'id desc',
'$filter': TIME_FILTER,
}
data = []
retries = 0
page_count = 0
logging.info(f"Fetching data from /{endpoint}")
while url:
try:
logging.info(f"Requesting page {page_count + 1}: {url}")
response = requests.get(url, headers=headers, params=params if url.endswith(endpoint) else None)
if response.status_code == 200:
json_resp = response.json()
page_data = json_resp.get('value', [])
logging.info(f"Retrieved {len(page_data)} records from page {page_count + 1}")
if not page_data:
break
for record in page_data:
data.append(flatten_record(record))
url = json_resp.get('@odata.nextLink')
retries = 0
page_count += 1
if endpoint == 'users':
time.sleep(2)
elif response.status_code == 429:
wait = 90 if endpoint == 'users' else 60
logging.warning(f"Rate limited on /{endpoint}. Waiting for {wait} seconds.")
time.sleep(wait)
elif response.status_code in (500, 502, 503, 504):
if retries < MAX_RETRIES:
wait = 2 ** retries
logging.warning(f"Transient error {response.status_code} on /{endpoint}. Retrying in {wait} seconds...")
time.sleep(wait)
retries += 1
else:
logging.error(f"Max retries reached on /{endpoint}. Aborting.")
break
else:
logging.error(f"Unexpected error {response.status_code} on /{endpoint}: {response.text}")
break
except Exception as e:
logging.exception(f"Exception occurred while calling /{endpoint}")
break
if data:
df = pd.DataFrame(data)
df = clean_illegal_chars(df)
logging.info(f"Ingestion complete for /{endpoint}, {len(df)} records.")
return df
else:
logging.info(f"No data retrieved from /{endpoint}.")
return pd.DataFrame()
def get_conversation_topics(interactions_df, conversation_id):
"""Get all entities for a conversation ID"""
entities = interactions_df[interactions_df['conversation_id'] == conversation_id]['detail_entity'].dropna().unique()
return ', '.join([str(e) for e in entities if str(e) != 'nan'])
def get_interaction_content(row, interactions_df):
"""Extract interaction content from detail fields"""
interaction_id = row['id']
interaction_row = interactions_df[interactions_df['id'] == interaction_id]
if interaction_row.empty:
return ""
interaction_row = interaction_row.iloc[0]
# Based on API example, only use fields that actually exist
# From the example: detail.content is the main field for user message content
if 'detail_content' in interaction_row and pd.notna(interaction_row['detail_content']) and str(interaction_row['detail_content']) != '':
return str(interaction_row['detail_content'])
return ""
def get_bot_response(row, interactions_df):
"""Get bot response using parent interaction ID"""
interaction_id = row['id']
# Find bot interactions with this interaction as parent
bot_response = interactions_df[
(interactions_df['parent_interaction_id'] == interaction_id) &
(interactions_df['actor'] == 'bot')
]
if not bot_response.empty:
bot_row = bot_response.iloc[0]
# Only use the content field that actually exists in the API
if 'detail_content' in bot_row and pd.notna(bot_row['detail_content']) and str(bot_row['detail_content']) != '':
return str(bot_row['detail_content'])
return ""
def get_plugin_info(interaction_id, plugin_calls_df, plugin_type):
"""Get plugin information based on type (unsuccessful, served, used)"""
if plugin_calls_df.empty:
return ""
interaction_plugins = plugin_calls_df[plugin_calls_df['interaction_id'] == interaction_id]
if interaction_plugins.empty:
return ""
if plugin_type == 'unsuccessful':
# Plugins that were not served and not used
plugins = interaction_plugins[
(interaction_plugins['served'] == False) &
(interaction_plugins['used'] == False)
]['plugin_name'].unique()
elif plugin_type == 'served':
# Plugins that were served
plugins = interaction_plugins[
interaction_plugins['served'] == True
]['plugin_name'].unique()
elif plugin_type == 'used':
# Plugins that were served and used
plugins = interaction_plugins[
(interaction_plugins['served'] == True) &
(interaction_plugins['used'] == True)
]['plugin_name'].unique()
else:
return ""
return ', '.join([str(p) for p in plugins if str(p) != 'nan'])
def get_resource_info(interaction_id, plugin_resources_df, info_type):
"""Get resource information (domain, citations, content items, tickets)"""
if plugin_resources_df.empty:
return "" if info_type != 'citation_count' else 0
interaction_resources = plugin_resources_df[plugin_resources_df['interaction_id'] == interaction_id]
if interaction_resources.empty:
return "" if info_type != 'citation_count' else 0
if info_type == 'domain':
domains = interaction_resources['detail_domain'].dropna().unique()
return ', '.join([str(d) for d in domains if str(d) != 'nan'])
elif info_type == 'citation_count':
unique_resources = interaction_resources['resource_id'].dropna().nunique()
return unique_resources
elif info_type == 'content_item_id':
# Use external_resource_id from detail field
content_ids = interaction_resources['detail_external_resource_id'].dropna().unique()
return ', '.join([str(c) for c in content_ids if str(c) != 'nan'])
elif info_type == 'content_item_name':
content_names = interaction_resources['detail_name'].dropna().unique()
return ', '.join([str(c) for c in content_names if str(c) != 'nan'])
elif info_type == 'ticket_type':
# Need to determine ticket type logic based on your business rules
# This might need adjustment based on how you determine user vs bot initiated
ticket_resources = interaction_resources[interaction_resources['type'] == 'RESOURCE_TYPE_TICKET']
if not ticket_resources.empty:
return "user initiated ticket" # Default, adjust logic as needed
return ""
elif info_type == 'ticket_id':
ticket_resources = interaction_resources[interaction_resources['type'] == 'RESOURCE_TYPE_TICKET']
ticket_ids = ticket_resources['detail_external_resource_id'].dropna().unique()
return ', '.join([str(t) for t in ticket_ids if str(t) != 'nan'])
return ""
def get_interaction_surface(interaction_id, interactions_df):
"""Get the platform/surface where interaction took place"""
interaction_row = interactions_df[interactions_df['id'] == interaction_id]
if not interaction_row.empty:
# Use 'platform' field from the API response
surface = interaction_row.iloc[0].get('platform', '')
return str(surface) if pd.notna(surface) else ""
return ""
def get_user_language(user_id, users_df):
"""Get user's preferred language"""
if users_df.empty:
return ""
# Use the actual field name from API response
user_row = users_df[users_df['id'] == user_id]
if not user_row.empty:
language = user_row.iloc[0].get('user_preferred_language', '')
return str(language) if pd.notna(language) else ""
return ""
def build_raw_interactions_table(all_dataframes):
"""Build the final raw interactions table"""
logging.info("Building raw interactions table...")
# Extract dataframes
interactions_df = all_dataframes.get('interactions', pd.DataFrame())
conversations_df = all_dataframes.get('conversations', pd.DataFrame())
plugin_calls_df = all_dataframes.get('plugin-calls', pd.DataFrame())
plugin_resources_df = all_dataframes.get('plugin-resources', pd.DataFrame())
users_df = all_dataframes.get('users', pd.DataFrame())
if interactions_df.empty:
logging.error("No interactions data available")
return pd.DataFrame()
# Filter for user-led interactions only
user_interactions = interactions_df[interactions_df['actor'] == 'user'].copy()
if user_interactions.empty:
logging.warning("No user interactions found")
return pd.DataFrame()
logging.info(f"Processing {len(user_interactions)} user interactions...")
# Initialize the raw interactions table
raw_interactions = pd.DataFrame()
# Base columns from interactions
raw_interactions['Timestamp'] = user_interactions['created_time']
raw_interactions['Conversation ID'] = user_interactions['conversation_id']
raw_interactions['Interaction ID'] = user_interactions['id']
raw_interactions['Interaction Type'] = user_interactions['type'] # Changed from 'interaction_type'
# Get conversation domain from conversations table
if not conversations_df.empty:
conversation_domain_map = conversations_df.set_index('id')['primary_domain'].to_dict() # Changed from 'domain'
raw_interactions['Conversation Domain'] = raw_interactions['Conversation ID'].map(conversation_domain_map)
else:
raw_interactions['Conversation Domain'] = ""
# Get conversation topics (aggregated for each conversation)
logging.info("Processing conversation entities...")
raw_interactions['Conversation Topic'] = raw_interactions['Conversation ID'].apply(
lambda x: get_conversation_topics(interactions_df, x)
)
# Get interaction content and bot response
logging.info("Processing interaction content and bot responses...")
raw_interactions['Interaction Content'] = user_interactions.apply(
lambda row: get_interaction_content(row, interactions_df), axis=1
)
raw_interactions['Bot Response'] = user_interactions.apply(
lambda row: get_bot_response(row, interactions_df), axis=1
)
# Get plugin information
logging.info("Processing plugin information...")
raw_interactions['Unsuccessful Plugins'] = raw_interactions['Interaction ID'].apply(
lambda x: get_plugin_info(x, plugin_calls_df, 'unsuccessful')
)
raw_interactions['Plugin Served'] = raw_interactions['Interaction ID'].apply(
lambda x: get_plugin_info(x, plugin_calls_df, 'served')
)
raw_interactions['Plugin Used'] = raw_interactions['Interaction ID'].apply(
lambda x: get_plugin_info(x, plugin_calls_df, 'used')
)
# Get resource information
logging.info("Processing resource information...")
raw_interactions['Resource Domain'] = raw_interactions['Interaction ID'].apply(
lambda x: get_resource_info(x, plugin_resources_df, 'domain')
)
raw_interactions['No of Citations'] = raw_interactions['Interaction ID'].apply(
lambda x: get_resource_info(x, plugin_resources_df, 'citation_count')
)
raw_interactions['Content Item Name'] = raw_interactions['Interaction ID'].apply(
lambda x: get_resource_info(x, plugin_resources_df, 'content_item_name')
)
raw_interactions['Content Item ID'] = raw_interactions['Interaction ID'].apply(
lambda x: get_resource_info(x, plugin_resources_df, 'content_item_id')
)
raw_interactions['Ticket Type'] = raw_interactions['Interaction ID'].apply(
lambda x: get_resource_info(x, plugin_resources_df, 'ticket_type')
)
raw_interactions['Ticket ID'] = raw_interactions['Interaction ID'].apply(
lambda x: get_resource_info(x, plugin_resources_df, 'ticket_id')
)
# Get interaction surface
raw_interactions['Interaction Surface'] = raw_interactions['Interaction ID'].apply(
lambda x: get_interaction_surface(x, interactions_df)
)
# Get user information
if 'user_id' in user_interactions.columns:
logging.info("Processing user information...")
raw_interactions['User Preferred Language'] = user_interactions['user_id'].apply(
lambda x: get_user_language(x, users_df)
)
else:
raw_interactions['User Preferred Language'] = ""
# Add placeholder columns for external user data
raw_interactions['User Department'] = "" # To be populated from external identity system
raw_interactions['User Location'] = "" # To be populated from external identity system
raw_interactions['User Country'] = "" # To be populated from external identity system
logging.info(f"Raw interactions table built with {len(raw_interactions)} rows and {len(raw_interactions.columns)} columns")
return raw_interactions
def main():
"""Main function to orchestrate the entire process"""
# Get user input for date range
time_filter, start_date_str, end_date_str = get_user_date_inputs()
logging.info("Starting raw interactions table builder...")
# Step 1: Fetch all data
all_dataframes = {}
for endpoint in ENDPOINTS:
df = fetch_data(endpoint, time_filter)
if not df.empty:
all_dataframes[endpoint] = df
if not all_dataframes:
logging.error("No data retrieved from any endpoint")
return
# Step 2: Build raw interactions table
raw_interactions_table = build_raw_interactions_table(all_dataframes)
if raw_interactions_table.empty:
logging.error("Failed to build raw interactions table")
return
# Step 3: Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Save raw data to Excel (optional)
raw_data_file = Path(f"moveworks_raw_data_{start_date_str}_to_{end_date_str}_{timestamp}.xlsx")
with pd.ExcelWriter(raw_data_file, engine='openpyxl') as writer:
for endpoint, df in all_dataframes.items():
sheet_name = endpoint[:31] # Excel sheet name max limit
df.to_excel(writer, sheet_name=sheet_name, index=False)
logging.info(f"Raw data saved to {raw_data_file}")
# Save final raw interactions table
final_file = Path(f"raw_interactions_table_{start_date_str}_to_{end_date_str}_{timestamp}.xlsx")
raw_interactions_table.to_excel(final_file, index=False, sheet_name='Raw Interactions')
logging.info(f"Final raw interactions table saved to {final_file}")
# Also save as CSV for easier processing
csv_file = Path(f"raw_interactions_table_{start_date_str}_to_{end_date_str}_{timestamp}.csv")
raw_interactions_table.to_csv(csv_file, index=False)
logging.info(f"Final raw interactions table saved to {csv_file}")
# Print summary
print("\n" + "="*50)
print("RAW INTERACTIONS TABLE SUMMARY")
print("="*50)
print(f"Total interactions processed: {len(raw_interactions_table)}")
print(f"Date range: {start_date_str} to {end_date_str}")
print(f"Time filter: {time_filter}")
print(f"Columns: {list(raw_interactions_table.columns)}")
print(f"Files generated:")
print(f" - {raw_data_file} (raw data)")
print(f" - {final_file} (final table - Excel)")
print(f" - {csv_file} (final table - CSV)")
# Show sample data
if len(raw_interactions_table) > 0:
print(f"\nSample data (first 3 rows):")
print(raw_interactions_table.head(3).to_string())
if __name__ == "__main__":
main()
Updated about 19 hours ago