Merge 1f8e215ed52ea01503fe7de5efc1f16287977173 into c0d2f624c09dc18e709e37c2ad90c039a4eb72a2

This commit is contained in:
Arya vaghayenegar 2025-11-23 08:57:03 +00:00 committed by GitHub
commit ac2691c67f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
47 changed files with 6040 additions and 4 deletions

.railwayignore Normal file

@ -0,0 +1,28 @@
# Railway ignore file - prevent large files from being included in build
node_modules/.bin
node_modules/.cache
node_modules/.vite
.next
.nuxt
dist
build
*.log
.DS_Store
.env.local
.venv
venv
__pycache__
*.pyc
.pytest_cache
.coverage
.git
.github
notebooks
tests
data
*.png
*.svg
.flake8
.pre-commit-config.yaml
CHANGELOG.md
model-card.md

QUICKSTART.md Normal file

@ -0,0 +1,305 @@
# Farsi Transcriber - Quick Start Guide
You now have **TWO** complete applications for Farsi transcription:
## 🖥️ Option 1: Desktop App (PyQt6)
**Location:** `/home/user/whisper/farsi_transcriber/`
### Setup
```bash
cd farsi_transcriber
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python main.py
```
**Features:**
- ✅ Standalone desktop application
- ✅ Works completely offline
- ✅ Direct access to file system
- ✅ Lightweight and fast
- ⚠️ Simpler UI (green theme)
**Good for:**
- Local-only transcription
- Users who prefer desktop apps
- Offline processing
---
## 🌐 Option 2: Web App (React + Flask)
**Location:** `/home/user/whisper/farsi_transcriber_web/`
### Setup
**Backend (Flask):**
```bash
cd farsi_transcriber_web/backend
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
python app.py
# API runs on http://localhost:5000
```
**Frontend (React):**
```bash
cd farsi_transcriber_web
npm install
npm run dev
# App runs on http://localhost:3000
```
**Features:**
- ✅ Modern web-based UI (matches your Figma design exactly)
- ✅ File queue management
- ✅ Dark/Light theme toggle
- ✅ Search with text highlighting
- ✅ Copy segments to clipboard
- ✅ Resizable window
- ✅ RTL support for Farsi
- ✅ Multiple export formats
- ✅ Professional styling
**Good for:**
- Modern web experience
- Team collaboration (can be deployed online)
- More features and polish
- Professional appearance
---
## 📊 Comparison
| Feature | Desktop (PyQt6) | Web (React) |
|---------|-----------------|------------|
| **Interface** | Simple, green | Modern, professional |
| **Dark Mode** | ❌ | ✅ |
| **File Queue** | ❌ | ✅ |
| **Search** | ❌ | ✅ |
| **Copy Segments** | ❌ | ✅ |
| **Resizable Window** | ❌ | ✅ |
| **Export Formats** | SRT, TXT, VTT, JSON, TSV | TXT, SRT, VTT, JSON |
| **Offline** | ✅ | Requires backend |
| **Easy Setup** | ✅✅ | ✅ (2 terminals) |
| **Deployment** | Desktop only | Can host online |
| **Code Size** | ~25KB | ~200KB |
---
## 🚀 Which Should You Use?
### Use **Desktop App** if:
- You want simple, quick setup
- You don't need to share transcriptions online
- You prefer offline processing
- You don't need advanced features
### Use **Web App** if:
- You like modern interfaces
- You want dark/light themes
- You need file queue management
- You want to potentially share online
- You want professional appearance
---
## 📁 Project Structure
```
whisper/
├── farsi_transcriber/ (Desktop PyQt6 App)
│ ├── ui/
│ ├── models/
│ ├── utils/
│ ├── config.py
│ ├── main.py
│ └── requirements.txt
└── farsi_transcriber_web/ (Web React App)
├── src/
│ ├── App.tsx
│ ├── components/
│ └── main.tsx
├── backend/
│ ├── app.py
│ └── requirements.txt
├── package.json
└── vite.config.ts
```
---
## 🔧 System Requirements
### Desktop App
- Python 3.8+
- ffmpeg
- 4GB RAM
### Web App
- Python 3.8+ (backend)
- Node.js 16+ (frontend)
- ffmpeg
- 4GB RAM
---
## 📝 Setup Checklist
### Initial Setup (One-time)
- [ ] Install ffmpeg
```bash
# Ubuntu/Debian
sudo apt install ffmpeg
# macOS
brew install ffmpeg
# Windows
choco install ffmpeg
```
- [ ] Verify Python 3.8+
```bash
python3 --version
```
- [ ] Verify Node.js 16+ (for web app only)
```bash
node --version
```
### Desktop App Setup
- [ ] Create virtual environment
- [ ] Install requirements
- [ ] Run app
### Web App Setup
**Backend:**
- [ ] Create virtual environment
- [ ] Install requirements
- [ ] Run Flask server
**Frontend:**
- [ ] Install Node dependencies
- [ ] Run dev server
---
## 🎯 Quick Start (Fastest)
### Desktop (30 seconds)
```bash
cd whisper/farsi_transcriber
python3 -m venv venv && source venv/bin/activate
pip install -r requirements.txt && python main.py
```
### Web (2 minutes)
Terminal 1:
```bash
cd whisper/farsi_transcriber_web/backend
python3 -m venv venv && source venv/bin/activate
pip install -r requirements.txt && python app.py
```
Terminal 2:
```bash
cd whisper/farsi_transcriber_web
npm install && npm run dev
```
---
## 🐛 Troubleshooting
### "ffmpeg not found"
Install ffmpeg (see requirements above)
### "ModuleNotFoundError" (Python)
```bash
# Ensure virtual environment is activated
source venv/bin/activate # Linux/Mac
# or
venv\Scripts\activate # Windows
```
### "npm: command not found"
Install Node.js from https://nodejs.org
### App runs slow
- Use a GPU: install CUDA and a CUDA-enabled PyTorch build
- Reduce the model size: switch to 'small' or 'tiny' (see the snippet below)
- Close other applications
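For the desktop app, the model size is a one-line change in `farsi_transcriber/config.py`; the web backend's equivalent is the `whisper.load_model('medium')` call in `backend/app.py`. A minimal sketch:
```python
# farsi_transcriber/config.py
DEFAULT_MODEL = "small"  # or "tiny" for the fastest (least accurate) option
```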
---
## 📚 Full Documentation
- **Desktop App:** `farsi_transcriber/README.md`
- **Web App:** `farsi_transcriber_web/README.md`
- **API Docs:** `farsi_transcriber_web/README.md` (Endpoints section)
---
## 🎓 What Was Built
### Desktop Application (PyQt6)
✅ File picker for audio/video
✅ Whisper integration with word-level timestamps
✅ 5 export formats (TXT, SRT, VTT, JSON, TSV)
✅ Professional styling
✅ Progress indicators
✅ Threading to prevent UI freezing
### Web Application (React + Flask)
✅ Complete Figma design implementation
✅ File queue management
✅ Dark/light theme
✅ Search with highlighting
✅ Segment management
✅ Resizable window
✅ RTL support
✅ Flask backend with Whisper integration
✅ 4 export formats
✅ Real file upload handling
---
## 🚀 Next Steps
1. **Choose your app** (Desktop or Web)
2. **Install ffmpeg** if not already installed
3. **Follow the setup instructions** above
4. **Test with a Farsi audio file**
5. **Export in your preferred format**
---
## 💡 Tips
- **First transcription is slow** (the medium model, 769M parameters / roughly 1.5 GB, is downloaded on first use)
- **Use larger models** (medium/large) for better accuracy
- **Use smaller models** (tiny/base) for speed
- **GPU significantly speeds up** transcription
- **Both apps work offline** (after initial model download)
---
## 📧 Need Help?
- Check the full README in each app's directory
- Verify all requirements are installed
- Check browser console (web app) or Python output (desktop)
- Ensure ffmpeg is in your PATH
---
**Enjoy your Farsi transcription apps!** 🎉

RAILWAY_QUICKSTART.md Normal file

@ -0,0 +1,114 @@
# Railway Deployment - Quick Start (5 Minutes)
Deploy your Farsi Transcriber to Railway in just 5 minutes! 🚀
---
## **What You'll Get**
✅ Your app live online
✅ Free $5/month credit
✅ 24/7 uptime
✅ Automatic scaling
✅ No credit card needed (free tier)
---
## **Step 1: Create Railway Account** (2 min)
1. Go to **https://railway.app**
2. Click **"Login with GitHub"**
3. Authorize with your GitHub account
4. Done! You get $5 free credit ✅
---
## **Step 2: Create Backend Service** (2 min)
1. Click **"Create New Project"**
2. Select **"GitHub Repo"**
3. Find your **whisper** fork
4. Railway auto-detects Python project
5. **Root Directory:** `farsi_transcriber_web/backend`
6. Click **Deploy**
7. **Wait 2-3 minutes** for deployment
8. **Copy the URL** that appears (e.g., `https://farsi-api-xxx.railway.app`)
---
## **Step 3: Create Frontend Service** (1 min)
1. In Railway project, click **"New Service"** → **"GitHub Repo"**
2. Select **whisper** again
3. **Root Directory:** `farsi_transcriber_web`
4. Click **Deploy**
5. **Wait 3-5 minutes** for build and deployment
---
## **Step 4: Connect Frontend to Backend** (Bonus step - 1 min)
1. In Railway, select **frontend** service
2. Go to **Variables**
3. Edit `VITE_API_URL` and paste your backend URL from Step 2
4. Click **Deploy** to redeploy with correct API URL
---
## **That's It! 🎉**
Your app is now live! Click the frontend service to see your live URL.
Example URLs:
- Frontend: `https://farsi-transcriber-prod.railway.app`
- Backend: `https://farsi-api-prod.railway.app`
---
## **Test Your App**
1. Click your frontend URL
2. Add a file
3. Click Transcribe
4. Wait for transcription
5. Export results
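If something looks off, you can query the backend's `/health` endpoint directly (a sketch using Python's `requests` package; replace the URL with your backend URL from Step 2):
```python
import requests

resp = requests.get("https://farsi-api-xxx.railway.app/health")  # placeholder URL
print(resp.json())  # expect {"status": "healthy", "model_loaded": ..., ...}
```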
---
## **Detailed Guide**
For more details, see: `farsi_transcriber_web/RAILWAY_DEPLOYMENT.md`
---
## **Cost**
- **First 3 months:** FREE ($5/month credit)
- **After that:** ~$2-3/month for personal use
- Can upgrade to paid tier for more resources
---
## **Common Issues**
**"API connection failed"**
- Make sure backend URL is correct in frontend variables
- Redeploy frontend after updating API URL
**"Model not loaded"**
- Wait 1-2 minutes on first transcription
- Model downloads on first use
**"Build failed"**
- Check Railway logs for errors
- Ensure all files are committed
---
## **Support**
For detailed setup help, see: `farsi_transcriber_web/RAILWAY_DEPLOYMENT.md`
---
**Your Farsi Transcriber is now online!** Share the URL with anyone! 🌐

farsi_transcriber/.gitignore vendored Normal file

@ -0,0 +1,52 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
ENV/
env/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
# PyTorch/ML Models
*.pt
*.pth
models/downloaded/
# Whisper models cache
~/.cache/whisper/
# Application outputs
transcriptions/
exports/
*.log
# Testing
.pytest_cache/
.coverage
htmlcov/

farsi_transcriber/README.md Normal file

@ -0,0 +1,257 @@
# Farsi Transcriber
A professional desktop application for transcribing Farsi audio and video files using OpenAI's Whisper model.
## Features
✨ **Core Features**
- 🎙️ Transcribe audio files (MP3, WAV, M4A, FLAC, OGG, AAC, WMA)
- 🎬 Extract audio from video files (MP4, MKV, MOV, WebM, AVI, FLV, WMV)
- 🇮🇷 High-accuracy Farsi/Persian language transcription
- ⏱️ Word-level timestamps for precise timing
- 📤 Export to multiple formats (TXT, SRT, VTT, JSON, TSV)
- 💻 Clean, intuitive PyQt6-based GUI
- 🚀 GPU acceleration support (CUDA) with automatic fallback to CPU
- 🔄 Progress indicators and real-time status updates
## System Requirements
**Minimum:**
- Python 3.8 or higher
- 4GB RAM
- ffmpeg installed
**Recommended:**
- Python 3.10+
- 8GB+ RAM
- NVIDIA GPU with CUDA support (optional but faster)
- SSD for better performance
## Installation
### Step 1: Install ffmpeg
Choose your operating system:
**Ubuntu/Debian:**
```bash
sudo apt update && sudo apt install ffmpeg
```
**Fedora/CentOS:**
```bash
sudo dnf install ffmpeg
```
**macOS (Homebrew):**
```bash
brew install ffmpeg
```
**Windows (Chocolatey):**
```bash
choco install ffmpeg
```
**Windows (Scoop):**
```bash
scoop install ffmpeg
```
### Step 2: Set up Python environment
```bash
# Navigate to the repository
cd whisper/farsi_transcriber
# Create virtual environment
python3 -m venv venv
# Activate virtual environment
source venv/bin/activate # On Windows: venv\Scripts\activate
```
### Step 3: Install dependencies
```bash
pip install -r requirements.txt
```
This will install:
- PyQt6 (GUI framework)
- openai-whisper (transcription engine)
- PyTorch (deep learning framework)
- NumPy, tiktoken, tqdm (supporting libraries)
## Usage
### Running the Application
```bash
python main.py
```
### Step-by-Step Guide
1. **Launch the app** - Run `python main.py`
2. **Select a file** - Click "Select File" button to choose audio/video
3. **Transcribe** - Click "Transcribe" and wait for completion
4. **View results** - See transcription with timestamps
5. **Export** - Click "Export Results" to save in your preferred format
### Supported Export Formats
- **TXT** - Plain text (content only)
- **SRT** - SubRip subtitle format (with timestamps)
- **VTT** - WebVTT subtitle format (with timestamps)
- **JSON** - Structured format with segments and metadata
- **TSV** - Tab-separated values (spreadsheet compatible)
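The same exporters can also be driven from a script outside the GUI. A minimal sketch (the audio file name is a placeholder):
```python
from pathlib import Path

from farsi_transcriber.models.whisper_transcriber import FarsiTranscriber
from farsi_transcriber.utils.export import TranscriptionExporter

# Transcribe once, then write the same result in every supported format
transcriber = FarsiTranscriber(model_name="medium")
result = transcriber.transcribe("interview.mp3")  # placeholder file name

for fmt in ("txt", "srt", "vtt", "json", "tsv"):
    TranscriptionExporter.export(result, Path(f"interview.{fmt}"), fmt)
```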
## Configuration
Edit `config.py` to customize:
```python
# Model size (tiny, base, small, medium, large)
DEFAULT_MODEL = "medium"
# Language code
LANGUAGE_CODE = "fa" # Farsi
# Supported formats
SUPPORTED_AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ...}
SUPPORTED_VIDEO_FORMATS = {".mp4", ".mkv", ".mov", ".webm", ".avi", ...}
```
## Model Information
### Available Models
| Model | Size | Speed | Accuracy | VRAM |
|-------|------|-------|----------|------|
| tiny | 39M | ~10x | Good | ~1GB |
| base | 74M | ~7x | Very Good | ~1GB |
| small | 244M | ~4x | Excellent | ~2GB |
| medium | 769M | ~2x | Excellent | ~5GB |
| large | 1550M | 1x | Best | ~10GB |
**Default**: `medium` (recommended for Farsi)
### Performance Notes
- Larger models provide better accuracy but require more VRAM
- GPU (CUDA) dramatically speeds up transcription (8-10x faster)
- First run downloads the model (~500MB-3GB depending on model size)
- Subsequent runs use cached model files
## Project Structure
```
farsi_transcriber/
├── ui/ # User interface components
│ ├── __init__.py
│ ├── main_window.py # Main application window
│ └── styles.py # Styling and theming
├── models/ # Model management
│ ├── __init__.py
│ └── whisper_transcriber.py # Whisper wrapper
├── utils/ # Utility functions
│ ├── __init__.py
│ └── export.py # Export functionality
├── config.py # Configuration settings
├── main.py # Application entry point
├── __init__.py # Package init
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Troubleshooting
### Issue: "ffmpeg not found"
**Solution**: Install ffmpeg using your package manager (see Installation section)
### Issue: "CUDA out of memory"
**Solution**: Use a smaller model, or process long recordings in smaller chunks
### Issue: "Model download fails"
**Solution**: Check internet connection, try again. Models are cached in `~/.cache/whisper/`
### Issue: Slow transcription
**Solution**: Ensure CUDA is detected (`nvidia-smi`), or switch to a smaller, faster model
## Advanced Usage
### Custom Model Selection
Update `config.py`:
```python
DEFAULT_MODEL = "large" # For maximum accuracy
# or
DEFAULT_MODEL = "tiny" # For fastest processing
```
### Batch Processing (Future)
Script to process multiple files:
```python
from farsi_transcriber.models.whisper_transcriber import FarsiTranscriber

transcriber = FarsiTranscriber(model_name="medium")
for audio_file in audio_files:
    result = transcriber.transcribe(audio_file)
    # Process results
```
## Performance Tips
1. **Use GPU** - Ensure NVIDIA CUDA is properly installed
2. **Choose appropriate model** - Balance speed vs accuracy
3. **Close other applications** - Free up RAM/VRAM
4. **Use SSD** - Faster model loading and temporary file I/O
5. **Local processing** - All processing happens locally, no cloud uploads
## Development
### Code Style
```bash
# Format code
black farsi_transcriber/
# Check style
flake8 farsi_transcriber/
# Sort imports
isort farsi_transcriber/
```
### Future Features
- [ ] Batch processing
- [ ] Real-time transcription preview
- [ ] Speaker diarization
- [ ] Multi-language support UI
- [ ] Settings dialog
- [ ] Keyboard shortcuts
- [ ] Drag-and-drop support
- [ ] Recent files history
## License
MIT License - Personal use and modifications allowed
## Acknowledgments
Built with:
- [OpenAI Whisper](https://github.com/openai/whisper) - Speech recognition
- [PyQt6](https://www.riverbankcomputing.com/software/pyqt/) - GUI framework
- [PyTorch](https://pytorch.org/) - Deep learning
## Support
For issues or suggestions:
1. Check the troubleshooting section
2. Verify ffmpeg is installed
3. Ensure Python 3.8+ is used
4. Check available disk space
5. Verify CUDA setup (for GPU users)


@ -0,0 +1,8 @@
"""
Farsi Transcriber Application
A desktop application for transcribing Farsi audio and video files using OpenAI's Whisper.
"""
__version__ = "0.1.0"
__author__ = "Personal Project"


@ -0,0 +1,71 @@
"""
Configuration settings for Farsi Transcriber application
Manages model selection, device settings, and other configuration options.
"""
from pathlib import Path
# Application metadata
APP_NAME = "Farsi Transcriber"
APP_VERSION = "0.1.0"
APP_DESCRIPTION = "A desktop application for transcribing Farsi audio and video files"
# Model settings
DEFAULT_MODEL = "medium" # Options: tiny, base, small, medium, large
AVAILABLE_MODELS = ["tiny", "base", "small", "medium", "large"]
MODEL_DESCRIPTIONS = {
"tiny": "Smallest model (39M params) - Fastest, ~1GB VRAM required",
"base": "Small model (74M params) - Fast, ~1GB VRAM required",
"small": "Medium model (244M params) - Balanced, ~2GB VRAM required",
"medium": "Large model (769M params) - Good accuracy, ~5GB VRAM required",
"large": "Largest model (1550M params) - Best accuracy, ~10GB VRAM required",
}
# Language settings
LANGUAGE_CODE = "fa" # Farsi/Persian
LANGUAGE_NAME = "Farsi"
# Audio/Video settings
SUPPORTED_AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".wma"}
SUPPORTED_VIDEO_FORMATS = {".mp4", ".mkv", ".mov", ".webm", ".avi", ".flv", ".wmv"}
# UI settings
WINDOW_WIDTH = 900
WINDOW_HEIGHT = 700
WINDOW_MIN_WIDTH = 800
WINDOW_MIN_HEIGHT = 600
# Output settings
OUTPUT_DIR = Path.home() / "FarsiTranscriber" / "outputs"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
EXPORT_FORMATS = {
"txt": "Plain Text",
"srt": "SRT Subtitles",
"vtt": "WebVTT Subtitles",
"json": "JSON Format",
"tsv": "Tab-Separated Values",
}
# Device settings (auto-detect CUDA if available)
try:
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
except ImportError:
DEVICE = "cpu"
# Logging settings
LOG_LEVEL = "INFO"
LOG_FILE = OUTPUT_DIR / "transcriber.log"
def get_model_info(model_name: str) -> str:
"""Get description for a model"""
return MODEL_DESCRIPTIONS.get(model_name, "Unknown model")
def get_supported_formats() -> set:
"""Get all supported audio and video formats"""
return SUPPORTED_AUDIO_FORMATS | SUPPORTED_VIDEO_FORMATS

farsi_transcriber/main.py Normal file

@ -0,0 +1,26 @@
#!/usr/bin/env python3
"""
Farsi Transcriber - Main Entry Point
A PyQt6-based desktop application for transcribing Farsi audio and video files.
"""
import sys
from PyQt6.QtWidgets import QApplication
from farsi_transcriber.ui.main_window import MainWindow
def main():
"""Main entry point for the application"""
app = QApplication(sys.argv)
# Create and show main window
window = MainWindow()
window.show()
sys.exit(app.exec())
if __name__ == "__main__":
main()


@ -0,0 +1 @@
"""Model management for Farsi Transcriber"""


@ -0,0 +1,225 @@
"""
Whisper Transcriber Module
Handles Farsi audio/video transcription using OpenAI's Whisper model.
"""
import warnings
from pathlib import Path
from typing import Dict, List, Optional
import torch
import whisper
class FarsiTranscriber:
"""
Wrapper around Whisper model for Farsi transcription.
Supports both audio and video files, with word-level timestamp extraction.
"""
# Supported audio formats
AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".flac", ".ogg", ".aac", ".wma"}
# Supported video formats
VIDEO_FORMATS = {".mp4", ".mkv", ".mov", ".webm", ".avi", ".flv", ".wmv"}
# Language code for Farsi/Persian
FARSI_LANGUAGE = "fa"
def __init__(self, model_name: str = "medium", device: Optional[str] = None):
"""
Initialize Farsi Transcriber.
Args:
model_name: Whisper model size ('tiny', 'base', 'small', 'medium', 'large')
device: Device to use ('cuda', 'cpu'). Auto-detect if None.
"""
self.model_name = model_name
# Auto-detect device
if device is None:
self.device = "cuda" if torch.cuda.is_available() else "cpu"
else:
self.device = device
print(f"Using device: {self.device}")
# Load model
print(f"Loading Whisper model: {model_name}...")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
self.model = whisper.load_model(model_name, device=self.device)
print("Model loaded successfully")
def transcribe(
self,
file_path: str,
language: str = FARSI_LANGUAGE,
verbose: bool = False,
) -> Dict:
"""
Transcribe an audio or video file in Farsi.
Args:
file_path: Path to audio or video file
language: Language code (default: 'fa' for Farsi)
verbose: Whether to print progress
Returns:
Dictionary with transcription results including word-level segments
"""
file_path = Path(file_path)
# Validate file exists
if not file_path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
# Check format is supported
if not self._is_supported_format(file_path):
raise ValueError(
f"Unsupported format: {file_path.suffix}. "
f"Supported: {self.AUDIO_FORMATS | self.VIDEO_FORMATS}"
)
# Perform transcription
print(f"Transcribing: {file_path.name}")
result = self.model.transcribe(
str(file_path),
language=language,
verbose=verbose,
)
# Enhance result with word-level segments
enhanced_result = self._enhance_with_word_segments(result)
return enhanced_result
def _is_supported_format(self, file_path: Path) -> bool:
"""Check if file format is supported."""
suffix = file_path.suffix.lower()
return suffix in (self.AUDIO_FORMATS | self.VIDEO_FORMATS)
def _enhance_with_word_segments(self, result: Dict) -> Dict:
"""
Enhance transcription result with word-level timing information.
Args:
result: Whisper transcription result
Returns:
Enhanced result with word-level segments
"""
enhanced_segments = []
for segment in result.get("segments", []):
# Extract word-level timing if available
word_segments = self._extract_word_segments(segment)
enhanced_segment = {
"id": segment.get("id"),
"start": segment.get("start"),
"end": segment.get("end"),
"text": segment.get("text", ""),
"words": word_segments,
}
enhanced_segments.append(enhanced_segment)
result["segments"] = enhanced_segments
return result
def _extract_word_segments(self, segment: Dict) -> List[Dict]:
"""
Extract word-level timing from a segment.
Args:
segment: Whisper segment with text
Returns:
List of word dictionaries with timing information
"""
text = segment.get("text", "").strip()
if not text:
return []
# For now, approximate word timing by splitting the segment duration evenly across words.
# openai-whisper can also produce true per-word timing via transcribe(..., word_timestamps=True).
start_time = segment.get("start", 0)
end_time = segment.get("end", 0)
duration = end_time - start_time
words = text.split()
if not words:
return []
# Distribute time evenly across words (simple approach)
# More sophisticated timing can be extracted from Whisper's internal data
word_duration = duration / len(words) if words else 0
word_segments = []
for i, word in enumerate(words):
word_start = start_time + (i * word_duration)
word_end = word_start + word_duration
word_segments.append(
{
"word": word,
"start": word_start,
"end": word_end,
}
)
return word_segments
def format_result_for_display(
self, result: Dict, include_timestamps: bool = True
) -> str:
"""
Format transcription result for display in UI.
Args:
result: Transcription result
include_timestamps: Whether to include timestamps
Returns:
Formatted text string
"""
lines = []
for segment in result.get("segments", []):
text = segment.get("text", "").strip()
if not text:
continue
if include_timestamps:
start = segment.get("start", 0)
end = segment.get("end", 0)
timestamp = f"[{self._format_time(start)} - {self._format_time(end)}]"
lines.append(f"{timestamp}\n{text}\n")
else:
lines.append(text)
return "\n".join(lines)
@staticmethod
def _format_time(seconds: float) -> str:
"""Format seconds to HH:MM:SS format."""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
milliseconds = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"
def get_device_info(self) -> str:
"""Get information about current device and model."""
return (
f"Model: {self.model_name} | "
f"Device: {self.device.upper()} | "
f"VRAM: {torch.cuda.get_device_properties(self.device).total_memory / 1e9:.1f}GB "
if self.device == "cuda"
else f"Model: {self.model_name} | Device: {self.device.upper()}"
)


@ -0,0 +1,7 @@
PyQt6==6.6.1
PyQt6-Qt6==6.6.1
PyQt6-sip==13.6.0
torch>=1.10.1
numpy
openai-whisper
tqdm


@ -0,0 +1 @@
"""UI components for Farsi Transcriber"""


@ -0,0 +1,285 @@
"""
Main application window for Farsi Transcriber
Provides PyQt6-based GUI for selecting files and transcribing Farsi audio/video.
"""
import os
from pathlib import Path
from PyQt6.QtCore import Qt, QThread, pyqtSignal
from PyQt6.QtWidgets import (
QMainWindow,
QWidget,
QVBoxLayout,
QHBoxLayout,
QPushButton,
QLabel,
QTextEdit,
QProgressBar,
QFileDialog,
QMessageBox,
)
from PyQt6.QtGui import QFont
from farsi_transcriber.models.whisper_transcriber import FarsiTranscriber
from farsi_transcriber.utils.export import TranscriptionExporter
from farsi_transcriber.ui.styles import get_stylesheet, get_color
class TranscriptionWorker(QThread):
"""Worker thread for transcription to prevent UI freezing"""
# Signals
progress_update = pyqtSignal(str) # Status messages
transcription_complete = pyqtSignal(dict) # Results with timestamps
error_occurred = pyqtSignal(str) # Error messages
def __init__(self, file_path: str, model_name: str = "medium"):
super().__init__()
self.file_path = file_path
self.model_name = model_name
self.transcriber = None
def run(self):
"""Run transcription in background thread"""
try:
# Initialize Whisper transcriber
self.progress_update.emit("Loading Whisper model...")
self.transcriber = FarsiTranscriber(model_name=self.model_name)
# Perform transcription
self.progress_update.emit(f"Transcribing: {Path(self.file_path).name}")
result = self.transcriber.transcribe(self.file_path)
# Format result for display with timestamps
display_text = self.transcriber.format_result_for_display(result)
# Add full text for export
result["full_text"] = result.get("text", "")
self.progress_update.emit("Transcription complete!")
self.transcription_complete.emit(
{
"text": display_text,
"segments": result.get("segments", []),
"full_text": result.get("text", ""),
}
)
except Exception as e:
self.error_occurred.emit(f"Error: {str(e)}")
class MainWindow(QMainWindow):
"""Main application window for Farsi Transcriber"""
# Supported audio and video formats
SUPPORTED_FORMATS = (
"Audio Files (*.mp3 *.wav *.m4a *.flac *.ogg *.aac *.wma);;",
"Video Files (*.mp4 *.mkv *.mov *.webm *.avi *.flv *.wmv);;",
"All Files (*.*)",
)
def __init__(self):
super().__init__()
self.selected_file = None
self.transcription_worker = None
self.last_result = None
# Apply stylesheet
self.setStyleSheet(get_stylesheet())
self.init_ui()
def init_ui(self):
"""Initialize the user interface"""
self.setWindowTitle("Farsi Transcriber")
self.setGeometry(100, 100, 900, 700)
# Create central widget and main layout
central_widget = QWidget()
self.setCentralWidget(central_widget)
main_layout = QVBoxLayout(central_widget)
main_layout.setSpacing(10)
main_layout.setContentsMargins(20, 20, 20, 20)
# Title
title_label = QLabel("Farsi Audio/Video Transcriber")
title_font = QFont()
title_font.setPointSize(16)
title_font.setBold(True)
title_label.setFont(title_font)
main_layout.addWidget(title_label)
# File selection section
file_section_layout = QHBoxLayout()
self.file_label = QLabel("No file selected")
self.file_label.setStyleSheet("color: gray;")
file_section_layout.addWidget(self.file_label, 1)
self.select_button = QPushButton("Select File")
self.select_button.clicked.connect(self.on_select_file)
file_section_layout.addWidget(self.select_button)
main_layout.addLayout(file_section_layout)
# Transcribe button
self.transcribe_button = QPushButton("Transcribe")
self.transcribe_button.clicked.connect(self.on_transcribe)
self.transcribe_button.setEnabled(False)
main_layout.addWidget(self.transcribe_button)
# Progress bar
self.progress_bar = QProgressBar()
self.progress_bar.setRange(0, 0) # Indeterminate progress
self.progress_bar.setVisible(False)
main_layout.addWidget(self.progress_bar)
# Status label
self.status_label = QLabel("Ready")
self.status_label.setStyleSheet("color: #666; font-style: italic;")
main_layout.addWidget(self.status_label)
# Results text area
results_title = QLabel("Transcription Results:")
results_title_font = QFont()
results_title_font.setBold(True)
results_title.setFont(results_title_font)
main_layout.addWidget(results_title)
self.results_text = QTextEdit()
self.results_text.setReadOnly(True)
self.results_text.setPlaceholderText(
"Transcription results will appear here..."
)
# Set monospace font for results
mono_font = QFont("Courier New", 10)
self.results_text.setFont(mono_font)
main_layout.addWidget(self.results_text)
# Buttons layout (Export, Clear)
buttons_layout = QHBoxLayout()
buttons_layout.addStretch()
self.export_button = QPushButton("Export Results")
self.export_button.clicked.connect(self.on_export)
self.export_button.setEnabled(False)
buttons_layout.addWidget(self.export_button)
self.clear_button = QPushButton("Clear")
self.clear_button.clicked.connect(self.on_clear)
buttons_layout.addWidget(self.clear_button)
main_layout.addLayout(buttons_layout)
def on_select_file(self):
"""Handle file selection dialog"""
file_path, _ = QFileDialog.getOpenFileName(
self, "Select Audio or Video File", "", "".join(self.SUPPORTED_FORMATS)
)
if file_path:
self.selected_file = file_path
file_name = Path(file_path).name
self.file_label.setText(f"Selected: {file_name}")
self.file_label.setStyleSheet("color: #333;")
self.transcribe_button.setEnabled(True)
self.export_button.setEnabled(False)
self.results_text.clear()
self.status_label.setText("File selected. Click 'Transcribe' to start.")
def on_transcribe(self):
"""Handle transcription button click"""
if not self.selected_file:
QMessageBox.warning(self, "Error", "Please select a file first.")
return
# Disable buttons during transcription
self.transcribe_button.setEnabled(False)
self.select_button.setEnabled(False)
self.export_button.setEnabled(False)
# Show progress
self.progress_bar.setVisible(True)
self.status_label.setText("Transcribing...")
# Create and start worker thread
self.transcription_worker = TranscriptionWorker(self.selected_file)
self.transcription_worker.progress_update.connect(self.on_progress_update)
self.transcription_worker.transcription_complete.connect(
self.on_transcription_complete
)
self.transcription_worker.error_occurred.connect(self.on_error)
self.transcription_worker.start()
def on_progress_update(self, message: str):
"""Handle progress updates from worker thread"""
self.status_label.setText(message)
def on_transcription_complete(self, result: dict):
"""Handle completed transcription"""
self.progress_bar.setVisible(False)
self.transcribe_button.setEnabled(True)
self.select_button.setEnabled(True)
self.export_button.setEnabled(True)
self.status_label.setText("Transcription complete!")
# Display results with timestamps
self.results_text.setText(result.get("text", "No transcription available"))
# Store result for export
self.last_result = result
def on_error(self, error_message: str):
"""Handle errors from worker thread"""
self.progress_bar.setVisible(False)
self.transcribe_button.setEnabled(True)
self.select_button.setEnabled(True)
self.status_label.setText("Error occurred. Check message below.")
QMessageBox.critical(self, "Transcription Error", error_message)
def on_export(self):
"""Handle export button click"""
if not self.last_result:
QMessageBox.warning(self, "Warning", "No transcription to export.")
return
file_path, file_filter = QFileDialog.getSaveFileName(
self,
"Export Transcription",
"",
"Text Files (*.txt);;SRT Subtitles (*.srt);;WebVTT Subtitles (*.vtt);;JSON (*.json);;TSV (*.tsv)",
)
if file_path:
try:
file_path = Path(file_path)
# Determine format from file extension
suffix = file_path.suffix.lower().lstrip(".")
if not suffix:
# Default to txt if no extension
suffix = "txt"
file_path = file_path.with_suffix(".txt")
# Export using the appropriate format
TranscriptionExporter.export(self.last_result, file_path, suffix)
QMessageBox.information(
self,
"Success",
f"Transcription exported successfully to:\n{file_path.name}",
)
except Exception as e:
QMessageBox.critical(
self, "Export Error", f"Failed to export: {str(e)}"
)
def on_clear(self):
"""Clear all results and reset UI"""
self.selected_file = None
self.file_label.setText("No file selected")
self.file_label.setStyleSheet("color: gray;")
self.results_text.clear()
self.status_label.setText("Ready")
self.transcribe_button.setEnabled(False)
self.export_button.setEnabled(False)


@ -0,0 +1,107 @@
"""
Application styling and theming
Provides stylesheet and styling utilities for the Farsi Transcriber app.
"""
# Modern, professional light-themed stylesheet
MAIN_STYLESHEET = """
QMainWindow {
background-color: #f5f5f5;
}
QLabel {
color: #333333;
}
QLineEdit, QTextEdit {
background-color: #ffffff;
color: #333333;
border: 1px solid #d0d0d0;
border-radius: 4px;
padding: 5px;
font-size: 11pt;
}
QLineEdit:focus, QTextEdit:focus {
border: 2px solid #4CAF50;
background-color: #fafafa;
}
QPushButton {
background-color: #4CAF50;
color: white;
border: none;
border-radius: 4px;
padding: 8px 16px;
font-weight: bold;
font-size: 11pt;
min-height: 32px;
}
QPushButton:hover {
background-color: #45a049;
}
QPushButton:pressed {
background-color: #3d8b40;
}
QPushButton:disabled {
background-color: #cccccc;
color: #999999;
}
QProgressBar {
border: 1px solid #d0d0d0;
border-radius: 4px;
text-align: center;
background-color: #ffffff;
height: 20px;
}
QProgressBar::chunk {
background-color: #4CAF50;
border-radius: 3px;
}
QMessageBox QLabel {
color: #333333;
}
QMessageBox QPushButton {
min-width: 60px;
}
"""
# Color palette
COLORS = {
"primary": "#4CAF50",
"primary_hover": "#45a049",
"primary_active": "#3d8b40",
"background": "#f5f5f5",
"text": "#333333",
"text_secondary": "#666666",
"border": "#d0d0d0",
"success": "#4CAF50",
"error": "#f44336",
"warning": "#ff9800",
"info": "#2196F3",
}
# Font settings
FONTS = {
"default_size": 11,
"title_size": 16,
"mono_family": "Courier New",
}
def get_stylesheet() -> str:
"""Get the main stylesheet for the application"""
return MAIN_STYLESHEET
def get_color(color_name: str) -> str:
"""Get a color from the palette"""
return COLORS.get(color_name, "#000000")


@ -0,0 +1 @@
"""Utility functions for Farsi Transcriber"""


@ -0,0 +1,164 @@
"""
Export utilities for transcription results
Supports multiple export formats: TXT, SRT, JSON, TSV, VTT
"""
import json
from datetime import timedelta
from pathlib import Path
from typing import Dict, List
class TranscriptionExporter:
"""Export transcription results in various formats"""
@staticmethod
def export_txt(result: Dict, file_path: Path) -> None:
"""
Export transcription as plain text file.
Args:
result: Transcription result dictionary
file_path: Output file path
"""
text = result.get("full_text", "") or result.get("text", "")
with open(file_path, "w", encoding="utf-8") as f:
f.write(text)
@staticmethod
def export_srt(result: Dict, file_path: Path) -> None:
"""
Export transcription as SRT subtitle file.
Args:
result: Transcription result dictionary
file_path: Output file path
"""
segments = result.get("segments", [])
with open(file_path, "w", encoding="utf-8") as f:
for i, segment in enumerate(segments, 1):
start = TranscriptionExporter._format_srt_time(segment.get("start", 0))
end = TranscriptionExporter._format_srt_time(segment.get("end", 0))
text = segment.get("text", "").strip()
if text:
f.write(f"{i}\n")
f.write(f"{start} --> {end}\n")
f.write(f"{text}\n\n")
@staticmethod
def export_vtt(result: Dict, file_path: Path) -> None:
"""
Export transcription as WebVTT subtitle file.
Args:
result: Transcription result dictionary
file_path: Output file path
"""
segments = result.get("segments", [])
with open(file_path, "w", encoding="utf-8") as f:
f.write("WEBVTT\n\n")
for segment in segments:
start = TranscriptionExporter._format_vtt_time(segment.get("start", 0))
end = TranscriptionExporter._format_vtt_time(segment.get("end", 0))
text = segment.get("text", "").strip()
if text:
f.write(f"{start} --> {end}\n")
f.write(f"{text}\n\n")
@staticmethod
def export_json(result: Dict, file_path: Path) -> None:
"""
Export transcription as JSON file.
Args:
result: Transcription result dictionary
file_path: Output file path
"""
with open(file_path, "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
@staticmethod
def export_tsv(result: Dict, file_path: Path) -> None:
"""
Export transcription as TSV (tab-separated values) file.
Args:
result: Transcription result dictionary
file_path: Output file path
"""
segments = result.get("segments", [])
with open(file_path, "w", encoding="utf-8") as f:
# Write header
f.write("Index\tStart\tEnd\tDuration\tText\n")
for i, segment in enumerate(segments, 1):
start = segment.get("start", 0)
end = segment.get("end", 0)
duration = end - start
text = segment.get("text", "").strip()
if text:
f.write(
f"{i}\t{start:.2f}\t{end:.2f}\t{duration:.2f}\t{text}\n"
)
@staticmethod
def export(
result: Dict, file_path: Path, format_type: str = "txt"
) -> None:
"""
Export transcription in specified format.
Args:
result: Transcription result dictionary
file_path: Output file path
format_type: Export format ('txt', 'srt', 'vtt', 'json', 'tsv')
Raises:
ValueError: If format is not supported
"""
format_type = format_type.lower()
exporters = {
"txt": TranscriptionExporter.export_txt,
"srt": TranscriptionExporter.export_srt,
"vtt": TranscriptionExporter.export_vtt,
"json": TranscriptionExporter.export_json,
"tsv": TranscriptionExporter.export_tsv,
}
if format_type not in exporters:
raise ValueError(
f"Unsupported format: {format_type}. "
f"Supported formats: {list(exporters.keys())}"
)
exporters[format_type](result, file_path)
@staticmethod
def _format_srt_time(seconds: float) -> str:
"""Format time for SRT format (HH:MM:SS,mmm)"""
td = timedelta(seconds=seconds)
hours, remainder = divmod(int(td.total_seconds()), 3600)
minutes, secs = divmod(remainder, 60)
milliseconds = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
@staticmethod
def _format_vtt_time(seconds: float) -> str:
"""Format time for VTT format (HH:MM:SS.mmm)"""
td = timedelta(seconds=seconds)
hours, remainder = divmod(int(td.total_seconds()), 3600)
minutes, secs = divmod(remainder, 60)
milliseconds = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"


@ -0,0 +1,13 @@
# Frontend environment variables
# Copy this to .env.local and update with your values
# API URL for the backend
# Local development: http://localhost:5000
# Railway production: https://your-backend-app.railway.app
VITE_API_URL=http://localhost:5000
# Application name
VITE_APP_NAME=Farsi Transcriber
# Max file size (in MB)
VITE_MAX_FILE_SIZE=500


@ -0,0 +1,7 @@
# Production environment variables for Railway deployment
# Set VITE_API_URL in Railway environment variables instead of committing here
# Default fallback - will be overridden by Railway env var
VITE_API_URL=https://your-backend-url.railway.app
VITE_APP_NAME=Farsi Transcriber
VITE_MAX_FILE_SIZE=500

farsi_transcriber_web/.gitignore vendored Normal file

@ -0,0 +1,37 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Environment variables
.env
.env.local
.env.*.local
# Editor directories and files
.vscode
.idea
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
# OS
.DS_Store
Thumbs.db
# Build output
*.tgz
*.tsbuildinfo
vite.config.js
vite.config.d.ts


@ -0,0 +1,308 @@
# Railway Deployment Guide
Complete step-by-step guide to deploy your Farsi Transcriber web app to Railway.
## Prerequisites
1. **GitHub Account** - To connect your repository
2. **Railway Account** - Free signup at https://railway.app
3. **Git** - Already have this since you're using it
## Step 1: Prepare Your Code for Deployment
### 1.1 Make sure all files are committed
```bash
cd /home/user/whisper
git status
git add .
git commit -m "Ready for Railway deployment"
```
### 1.2 Push to GitHub
If you haven't already, push your fork to GitHub:
```bash
git push origin claude/review-repo-011CV3PVcA7ZSCTW2YquuMB8
```
---
## Step 2: Create Railway Account
1. Go to **https://railway.app**
2. Click **"Login with GitHub"**
3. Authorize Railway to access your GitHub account
4. You'll get **$5 monthly credit** for free ✅
---
## Step 3: Create Backend Service (Flask API)
### 3.1 Create a new project
1. In Railway dashboard, click **"Create New Project"**
2. Select **"GitHub Repo"**
3. Select your **whisper** repository
4. Railway will auto-detect it as Python
5. Configure:
- **Root Directory:** `farsi_transcriber_web/backend`
- **Start Command:** `gunicorn --workers 2 --bind 0.0.0.0:$PORT app:app`
### 3.2 Set environment variables
In the Railway dashboard for your backend:
1. Go to **Variables**
2. Add:
```
FLASK_ENV=production
FLASK_DEBUG=False
PYTHONUNBUFFERED=1
PORT=5000
```
### 3.3 Deploy
1. Click **Deploy**
2. Wait for deployment (takes 2-3 minutes)
3. Your backend URL will appear (e.g., `https://farsi-api-prod.railway.app`)
4. **Copy this URL** - you'll need it for the frontend
---
## Step 4: Create Frontend Service (React)
### 4.1 Create another service in same project
1. Go to your Railway project
2. Click **"Create New Service"**
3. Select **"GitHub Repo"**
4. Select your **whisper** repository
5. Configure:
- **Root Directory:** `farsi_transcriber_web`
- **Build Command:** `npm install && npm run build`
- **Start Command:** `npm run preview`
### 4.2 Set environment variables
In the Railway dashboard for your frontend:
1. Go to **Variables**
2. Add:
```
VITE_API_URL=https://your-backend-url-here.railway.app
VITE_APP_NAME=Farsi Transcriber
```
(Replace with your actual backend URL from Step 3.3)
### 4.3 Deploy
1. Click **Deploy**
2. Wait for deployment (2-5 minutes depending on npm install)
3. Your frontend URL will appear (e.g., `https://farsi-web-prod.railway.app`)
---
## Step 5: Configure Services to Communicate
### 5.1 Link backend to frontend
1. In Railway dashboard, select frontend service
2. Go to **Variables**
3. Update `VITE_API_URL` with your backend service domain
4. Deploy again
### 5.2 Test the connection
1. Open your frontend URL
2. Try to add a file and transcribe
3. Check browser console for any errors
4. If errors, check Railway logs (click service → Logs)
---
## Step 6: Monitor Your Deployment
### 6.1 View logs
In Railway dashboard:
- Click your service
- Go to **Logs** tab
- See real-time logs as users interact with your app
### 6.2 Check health
```bash
# Check if backend is running
curl https://your-backend-url.railway.app/health
# Should return:
# {"status": "healthy", "model_loaded": true, "environment": "production"}
```
### 6.3 Monitor usage
- Railway dashboard shows RAM, CPU, bandwidth usage
- Your $5 credit should last 1-3 months for personal use
---
## Step 7: Custom Domain (Optional)
If you want a custom domain like `farsi.yourdomain.com`:
1. Buy a domain on GoDaddy, Namecheap, etc.
2. In Railway dashboard → Your app → Settings → Domains
3. Add custom domain
4. Update DNS records at your domain provider
5. Railway will handle SSL certificate automatically
---
## Troubleshooting
### Issue: Backend showing error "Model not loaded"
**Solution:** The first transcription downloads and loads the medium model (769M parameters, roughly 1.5 GB), which can take a few minutes. Wait and try again.
### Issue: Frontend can't reach backend
**Solution:**
1. Check backend URL is correct in frontend variables
2. Backend must be running (check Railway logs)
3. CORS should be enabled (already configured)
### Issue: Build fails
**Solution:**
1. Check Railway build logs for errors
2. Ensure `package.json` has all required dependencies
3. Run locally first: `npm install && npm run build`
### Issue: App runs slow
**Solution:**
1. You're on free tier with limited resources
2. Upgrade to paid tier ($5/month) for better performance
3. Or wait for model to cache (subsequent transcriptions are fast)
### Issue: Out of memory
**Solution:**
1. Free tier has limited RAM
2. Close unused tabs/apps
3. Use smaller Whisper model (edit backend to use 'small' instead of 'medium')
---
## Next Steps: Custom Domain Setup
Once stable, add your custom domain:
1. Purchase domain
2. Railway → Settings → Domains → Add Domain
3. Update DNS CNAME records
4. Railway auto-generates SSL certificate
---
## Cost Breakdown
### Free Tier ($5/month credit)
- ✅ 500 build minutes/month
- ✅ 100 GB bandwidth/month
- ✅ 6,000 compute unit hours
- ✅ More than enough for personal use
### Your app will cost:
- **Backend (Flask):** ~$1-2/month
- **Frontend (React):** ~$0.50/month
- **Total:** ~$2/month (with free credit covering 2-3 months)
---
## Useful Commands
### Check if Railway CLI is installed
```bash
railway --version
```
### Install Railway CLI
```bash
npm i -g @railway/cli
```
### Deploy from command line
```bash
railway up
```
### View logs
```bash
railway logs
```
---
## What Happens Now
1. ✅ Your app is live on Railway
2. ✅ Free $5 monthly credit
3. ✅ Auto-scaling (if you get traffic)
4. ✅ 24/7 uptime
5. ✅ Automatic SSL/HTTPS
6. ✅ No infrastructure to manage
---
## Monitor Your App
Visit your Railway dashboard regularly to:
- Check resource usage
- View logs
- Update environment variables
- Scale services if needed
- Monitor costs
---
## After Deployment
Your app is now online! Share the URL with friends:
```
https://your-app-name.railway.app
```
---
## Further Reading
- [Railway Documentation](https://docs.railway.app)
- [Railway GitHub Integration](https://docs.railway.app/guides/github)
- [Railway Environment Variables](https://docs.railway.app/develop/variables)
- [Whisper API Docs](https://github.com/openai/whisper)
---
## Support
If you have issues:
1. Check Railway logs (click service → Logs)
2. Check browser console (F12 → Console tab)
3. Visit Railway docs: https://docs.railway.app
4. Check Flask logs for backend errors
---
**Congratulations! Your Farsi Transcriber is now live!** 🎉


@ -0,0 +1,384 @@
# Farsi Transcriber - Web Application
A professional web-based application for transcribing Farsi audio and video files using OpenAI's Whisper model.
## Features
✨ **Core Features**
- 🎙️ Transcribe audio files (MP3, WAV, M4A, FLAC, OGG, AAC, WMA)
- 🎬 Extract audio from video files (MP4, MKV, MOV, WebM, AVI, FLV, WMV)
- 🇮🇷 High-accuracy Farsi/Persian language transcription
- ⏱️ Word-level timestamps for precise timing
- 📤 Export to multiple formats (TXT, SRT, VTT, JSON)
- 💻 Clean, intuitive React-based UI built from a Figma design
- 🎨 Dark/Light theme toggle
- 🔍 Search and text highlighting in transcriptions
- 📋 File queue management
- 💾 Copy individual transcription segments
- 🚀 GPU acceleration support (CUDA)
- 🎯 Resizable window for flexible workspace
## Tech Stack
**Frontend:**
- React 18+ with TypeScript
- Vite (fast build tool)
- Tailwind CSS v4.0
- Lucide React (icons)
- re-resizable (window resizing)
- Sonner (toast notifications)
**Backend:**
- Flask (Python web framework)
- OpenAI Whisper (speech recognition)
- PyTorch (deep learning)
- Flask-CORS (cross-origin requests)
## System Requirements
**Frontend:**
- Node.js 16+
- npm/yarn/pnpm
**Backend:**
- Python 3.8+
- 4GB RAM minimum
- 8GB+ recommended
- ffmpeg installed
- Optional: NVIDIA GPU with CUDA support
## Installation
### Step 1: Install ffmpeg
Choose your operating system:
**Ubuntu/Debian:**
```bash
sudo apt update && sudo apt install ffmpeg
```
**macOS (Homebrew):**
```bash
brew install ffmpeg
```
**Windows (Chocolatey):**
```bash
choco install ffmpeg
```
### Step 2: Backend Setup
```bash
# Navigate to backend directory
cd backend
# Create virtual environment
python3 -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
```
### Step 3: Frontend Setup
```bash
# Navigate to root directory
cd ..
# Install Node dependencies
npm install
# Or use yarn/pnpm
yarn install
# or
pnpm install
```
## Running the Application
### Step 1: Start Backend API
```bash
cd backend
source venv/bin/activate # Activate virtual environment
python app.py
```
The API will be available at `http://localhost:5000`
### Step 2: Start Frontend Dev Server
In a new terminal:
```bash
npm run dev
```
The application will be available at `http://localhost:3000`
## Building for Production
### Frontend Build
```bash
npm run build
```
This creates an optimized production build in the `dist/` directory.
### Backend Deployment
For production, use a production WSGI server:
```bash
# Install Gunicorn
pip install gunicorn
# Run with Gunicorn
gunicorn -w 4 -b 0.0.0.0:5000 app:app
```
## API Endpoints
### `/health` (GET)
Health check endpoint
**Response:**
```json
{
"status": "healthy",
"model_loaded": true,
"device": "cuda|cpu"
}
```
### `/transcribe` (POST)
Transcribe audio/video file
**Request:**
- `file`: Audio/video file (multipart/form-data)
- `language`: Language code (optional, default: "fa" for Farsi)
**Response:**
```json
{
"status": "success",
"filename": "audio.mp3",
"language": "fa",
"text": "Full transcription text...",
"segments": [
{
"start": "00:00:00.000",
"end": "00:00:05.500",
"text": "سلام دنیا"
}
]
}
```
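For example, the endpoint can be exercised from a short script (a sketch using the third-party `requests` package; the URL and file name are placeholders):
```python
import requests

API_URL = "http://localhost:5000"  # or your deployed backend URL

with open("sample.mp3", "rb") as f:  # placeholder audio file
    resp = requests.post(
        f"{API_URL}/transcribe",
        files={"file": ("sample.mp3", f)},
        data={"language": "fa"},
    )
resp.raise_for_status()
payload = resp.json()
print(payload["text"])
for seg in payload["segments"]:
    print(seg["start"], "-->", seg["end"], seg["text"])
```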
### `/models` (GET)
Get available Whisper models
**Response:**
```json
{
"available_models": ["tiny", "base", "small", "medium", "large"],
"current_model": "medium",
"description": "..."
}
```
### `/export` (POST)
Export transcription
**Request:**
```json
{
"transcription": "Full text...",
"segments": [...],
"format": "txt|srt|vtt|json"
}
```
**Response:**
```json
{
"status": "success",
"format": "srt",
"content": "...",
"mime_type": "text/plain"
}
```
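A matching client call might look like this (a sketch, again using `requests`; it assumes `full_text` and `segments` came from a previous `/transcribe` response):
```python
import requests

resp = requests.post(
    "http://localhost:5000/export",
    json={"transcription": full_text, "segments": segments, "format": "srt"},
)
resp.raise_for_status()
with open("transcript.srt", "w", encoding="utf-8") as f:
    f.write(resp.json()["content"])
```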
## Usage Guide
### 1. Add Files to Queue
- Click "Add Files" button in the left sidebar
- Select audio or video files
- Multiple files can be added to the queue
### 2. Transcribe
- Select a file from the queue
- Click "Transcribe" button
- Watch the progress indicator
- Results appear with timestamps
### 3. Search & Copy
- Use the search bar to find specific text
- Matching text is highlighted
- Click copy icon to copy individual segments
### 4. Export Results
- Select export format (TXT, SRT, VTT, JSON)
- Click "Export" button
- File is downloaded or ready to save
### 5. Theme Toggle
- Click sun/moon icon in header
- Switch between light and dark themes
## Project Structure
```
farsi_transcriber_web/
├── src/
│ ├── App.tsx # Main application component
│ ├── main.tsx # React entry point
│ ├── index.css # Global styles
│ └── components/
│ ├── Button.tsx
│ ├── Progress.tsx
│ ├── Input.tsx
│ └── Select.tsx
├── backend/
│ ├── app.py # Flask API server
│ ├── requirements.txt # Python dependencies
│ └── .gitignore
├── public/
├── package.json
├── vite.config.ts
├── tsconfig.json
├── tailwind.config.js
├── postcss.config.js
└── README.md
```
## Configuration
### Environment Variables
Create a `.env.local` file in the root directory:
```
VITE_API_URL=http://localhost:5000
VITE_MAX_FILE_SIZE=500
```
### Backend Configuration
Edit `backend/app.py` to customize:
```python
# Change model size
model = whisper.load_model('large') # tiny, base, small, medium, large
# Change upload folder
UPLOAD_FOLDER = '/custom/path'
# Change max file size
MAX_FILE_SIZE = 1024 * 1024 * 1024 # 1GB
```
## Troubleshooting
### Issue: "API connection failed"
**Solution**: Ensure backend is running on `http://localhost:5000`
### Issue: "Whisper model not found"
**Solution**: The first run downloads the model (roughly 1.5 GB for the default medium model). Ensure a working internet connection and enough free disk space.
### Issue: "CUDA out of memory"
**Solution**: Use a smaller model in `backend/app.py` (see Backend Configuration above)
### Issue: "ffmpeg not found"
**Solution**: Install ffmpeg using your package manager (see Installation section)
### Issue: Port 3000 or 5000 already in use
**Solution**: Change ports in `vite.config.ts` and `backend/app.py`
## Performance Tips
1. **Use GPU** - Ensure NVIDIA CUDA is properly installed
2. **Choose appropriate model** - Balance speed vs accuracy
3. **Close other applications** - Free up RAM/VRAM
4. **Use SSD** - Faster model loading and file I/O
5. **Batch Processing** - Process multiple files sequentially
## Future Enhancements
- [ ] Drag-and-drop file upload
- [ ] Audio playback synchronized with transcription
- [ ] Edit segments inline
- [ ] Keyboard shortcuts
- [ ] Save/load sessions
- [ ] Speaker diarization
- [ ] Confidence scores
- [ ] Custom vocabulary support
## Development
### Code Style
```bash
# Format code (if ESLint configured)
npm run lint
# Build for development
npm run dev
# Build for production
npm run build
```
### Adding Components
New components go in `src/components/` and should:
- Use TypeScript
- Include prop interfaces
- Export as default
- Include JSDoc comments
## Common Issues & Solutions
| Issue | Solution |
|-------|----------|
| Models slow to load | GPU required for fast transcription |
| File not supported | Check file extension is in supported list |
| Transcription has errors | Try larger model (medium/large) |
| Application crashes | Check browser console and Flask logs |
| Export not working | Ensure segments data is complete |
## License
MIT License - Personal use and modifications allowed
## Credits
Built with:
- [OpenAI Whisper](https://github.com/openai/whisper) - Speech recognition
- [React](https://react.dev/) - UI framework
- [Vite](https://vitejs.dev/) - Build tool
- [Tailwind CSS](https://tailwindcss.com/) - Styling
- [Flask](https://flask.palletsprojects.com/) - Backend framework
## Support
For issues:
1. Check the troubleshooting section
2. Verify ffmpeg is installed
3. Check Flask backend logs
4. Review browser console for errors
5. Ensure Python 3.8+ and Node.js 16+ are installed


@ -0,0 +1,21 @@
# Backend environment variables
# Copy this to .env and update with your values
# Flask environment
FLASK_ENV=production
FLASK_DEBUG=False
# Server port (Railway sets PORT automatically)
PORT=5000
# CORS settings
FLASK_CORS_ORIGINS=*
# Whisper model settings
WHISPER_MODEL=medium
# File upload settings
MAX_FILE_SIZE=500000000
# Python path
PYTHONUNBUFFERED=1


@ -0,0 +1,42 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
ENV/
env/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# ML Models
*.pt
*.pth
~/.cache/whisper/
# Uploads
/uploads
/tmp


@ -0,0 +1 @@
web: gunicorn --workers 4 --worker-class sync --bind 0.0.0.0:$PORT app:app


@ -0,0 +1,228 @@
"""
Farsi Transcriber Backend API
Flask API for handling audio/video file transcription using Whisper model.
Configured for Railway deployment with lazy model loading.
"""
import os
import sys
import tempfile
from pathlib import Path
from werkzeug.utils import secure_filename
from flask import Flask, request, jsonify
from flask_cors import CORS
# Prevent model download during build
os.environ['WHISPER_CACHE'] = os.path.expanduser('~/.cache/whisper')
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})  # routes are served at the root (e.g. /transcribe), not under /api/
# Configuration
UPLOAD_FOLDER = tempfile.gettempdir()
ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'aac', 'wma', 'mp4', 'mkv', 'mov', 'webm', 'avi', 'flv', 'wmv'}
MAX_FILE_SIZE = 500 * 1024 * 1024 # 500MB
# Production settings
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE
app.config['ENV'] = os.getenv('FLASK_ENV', 'production')
# Load Whisper model (lazy load - only on first transcription request)
model = None
def load_model():
"""Lazy load Whisper model on first use (not during build)"""
global model
if model is None:
try:
print("⏳ Loading Whisper model for first time...")
print(" This may take 1-2 minutes on first run...")
# Import here to avoid loading during build
import whisper
model = whisper.load_model('medium')
print("✓ Whisper model loaded successfully")
except Exception as e:
print(f"✗ Error loading Whisper model: {e}")
model = None
return model
def allowed_file(filename):
"""Check if file has allowed extension"""
return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
@app.route('/', methods=['GET'])
def index():
"""Root endpoint"""
return jsonify({
'message': 'Farsi Transcriber API',
'version': '1.0.0',
'status': 'running'
})
@app.route('/health', methods=['GET'])
def health():
"""Health check endpoint - fast response without loading model"""
return jsonify({
'status': 'healthy',
'model_loaded': model is not None,
'environment': app.config['ENV']
})
@app.route('/transcribe', methods=['POST'])
def transcribe():
"""
Transcribe audio/video file
Request:
- file: Audio/video file
- language: Language code (default: 'fa' for Farsi)
Response:
- transcription results with segments and timestamps
"""
try:
# Load model if not already loaded
whisper_model = load_model()
if not whisper_model:
return jsonify({'error': 'Failed to load Whisper model'}), 500
# Check if file is in request
if 'file' not in request.files:
return jsonify({'error': 'No file provided'}), 400
file = request.files['file']
if file.filename == '':
return jsonify({'error': 'No file selected'}), 400
if not allowed_file(file.filename):
return jsonify({'error': 'File type not allowed'}), 400
# Save file
filename = secure_filename(file.filename)
filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
file.save(filepath)
# Get language code from request (default: Farsi)
language = request.form.get('language', 'fa')
# Transcribe
result = whisper_model.transcribe(filepath, language=language, verbose=False)
# Format response
segments = []
for segment in result.get('segments', []):
segments.append({
'start': f"{int(segment['start'] // 3600):02d}:{int((segment['start'] % 3600) // 60):02d}:{int(segment['start'] % 60):02d}.{int((segment['start'] % 1) * 1000):03d}",
'end': f"{int(segment['end'] // 3600):02d}:{int((segment['end'] % 3600) // 60):02d}:{int(segment['end'] % 60):02d}.{int((segment['end'] % 1) * 1000):03d}",
'text': segment['text'].strip(),
})
        # Clean up the uploaded file; ignore failures (e.g. file already removed)
        try:
            os.remove(filepath)
        except OSError:
            pass
return jsonify({
'status': 'success',
'filename': filename,
'language': result.get('language', 'unknown'),
'text': result.get('text', ''),
'segments': segments
})
except Exception as e:
return jsonify({'error': str(e)}), 500
@app.route('/models', methods=['GET'])
def get_models():
"""Get available Whisper models"""
return jsonify({
'available_models': ['tiny', 'base', 'small', 'medium', 'large'],
'current_model': 'medium',
'description': 'List of available Whisper models. Larger models are more accurate but slower.'
})
@app.route('/export', methods=['POST'])
def export():
"""
Export transcription in specified format
Request:
- transcription: Full transcription text
- segments: Array of segments with timestamps
- format: Export format (txt, srt, vtt, json)
Response:
- Exported file content
"""
try:
data = request.json
transcription = data.get('transcription', '')
segments = data.get('segments', [])
format_type = data.get('format', 'txt').lower()
if format_type == 'txt':
content = transcription
mime_type = 'text/plain'
elif format_type == 'srt':
content = _format_srt(segments)
mime_type = 'text/plain'
elif format_type == 'vtt':
content = _format_vtt(segments)
mime_type = 'text/plain'
elif format_type == 'json':
import json
content = json.dumps({'text': transcription, 'segments': segments}, ensure_ascii=False, indent=2)
mime_type = 'application/json'
else:
return jsonify({'error': 'Unsupported format'}), 400
return jsonify({
'status': 'success',
'format': format_type,
'content': content,
'mime_type': mime_type
})
except Exception as e:
return jsonify({'error': str(e)}), 500
def _format_srt(segments):
    """Format transcription as SRT subtitle format"""
    lines = []
    for i, segment in enumerate(segments, 1):
        # SRT uses a comma as the millisecond separator (00:00:01,000), unlike the
        # HH:MM:SS.mmm timestamps produced in /transcribe, so convert it here
        start = segment['start'].replace('.', ',')
        end = segment['end'].replace('.', ',')
        lines.append(str(i))
        lines.append(f"{start} --> {end}")
        lines.append(segment['text'])
        lines.append('')
    return '\n'.join(lines)
def _format_vtt(segments):
"""Format transcription as WebVTT subtitle format"""
lines = ['WEBVTT', '']
for segment in segments:
lines.append(f"{segment['start']} --> {segment['end']}")
lines.append(segment['text'])
lines.append('')
return '\n'.join(lines)
if __name__ == '__main__':
port = int(os.getenv('PORT', 5000))
debug = os.getenv('FLASK_ENV', 'production') == 'development'
app.run(debug=debug, host='0.0.0.0', port=port, threaded=True)
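
For reference, a minimal client for the `/transcribe` and `/export` endpoints above might look like the following. This is a sketch, not part of the app: it assumes the API is reachable at `http://localhost:5000` and that a local `sample.mp3` exists.

```python
import requests

API_URL = "http://localhost:5000"  # adjust for your deployment

# 1) Transcribe a local file (Farsi by default)
with open("sample.mp3", "rb") as f:
    r = requests.post(
        f"{API_URL}/transcribe",
        files={"file": ("sample.mp3", f)},
        data={"language": "fa"},
    )
r.raise_for_status()
result = r.json()
print(result["language"], "-", len(result["segments"]), "segments")

# 2) Export the same result as SRT and save it next to the audio
r = requests.post(
    f"{API_URL}/export",
    json={
        "transcription": result["text"],
        "segments": result["segments"],
        "format": "srt",
    },
)
r.raise_for_status()
with open("sample.srt", "w", encoding="utf-8") as out:
    out.write(r.json()["content"])
```

The first transcription call can take a while, since the Whisper model is lazy-loaded on the first request.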

View File

@ -0,0 +1,11 @@
# Backend Nixpacks configuration
# Ensures ffmpeg is available for Whisper audio processing
[phases.setup]
nixPkgs = ["ffmpeg"]
[phases.install]
cmds = ["pip install -r requirements.txt"]
[start]
cmd = "gunicorn --workers 2 --worker-class sync --timeout 120 --bind 0.0.0.0:$PORT app:app"

View File

@ -0,0 +1,10 @@
Flask==2.3.3
Flask-CORS==4.0.0
python-dotenv==1.0.0
openai-whisper>=20230314
torch>=1.10.1
numpy>=1.21.0
python-multipart==0.0.6
gunicorn==21.2.0

View File

@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Farsi Audio/Video Transcriber</title>
</head>
<body>
<div id="root"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

View File

@ -0,0 +1,11 @@
# Frontend Nixpacks configuration
# Node.js React app with Vite
[phases.install]
cmds = ["npm install"]
[phases.build]
cmds = ["npm run build"]
[start]
cmd = "npm run preview"

2456
farsi_transcriber_web/package-lock.json generated Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,32 @@
{
"name": "farsi-transcriber-web",
"private": true,
"version": "0.1.0",
"type": "module",
"scripts": {
"dev": "vite",
"build": "tsc -b && vite build",
"lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
"preview": "vite preview"
},
"dependencies": {
"@tailwindcss/postcss": "^4.1.17",
"lucide-react": "^0.263.1",
"re-resizable": "^6.9.9",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"sonner": "^1.2.0"
},
"devDependencies": {
"@types/node": "^20.8.0",
"@types/react": "^18.2.37",
"@types/react-dom": "^18.2.15",
"@vitejs/plugin-react": "^4.2.0",
"autoprefixer": "^10.4.16",
"postcss": "^8.4.31",
"tailwindcss": "^4.0.0",
"terser": "^5.44.1",
"typescript": "^5.2.2",
"vite": "^5.0.0"
}
}

View File

@ -0,0 +1,6 @@
export default {
plugins: {
'@tailwindcss/postcss': {},
autoprefixer: {},
},
}

View File

@ -0,0 +1,13 @@
# Railway configuration file
# https://docs.railway.app/reference/nixpacks
[build]
builder = "nixpacks"
[[services]]
name = "backend"
startCommand = "cd backend && gunicorn --workers 2 --worker-class sync --timeout 120 --bind 0.0.0.0:5000 app:app"
[[services]]
name = "frontend"
startCommand = "npm run build && npm run preview"

View File

@ -0,0 +1,537 @@
import { useState, useRef } from 'react';
import {
FileAudio,
Upload,
Moon,
Sun,
Search,
Copy,
X,
CheckCircle2,
Clock,
Loader2,
Download
} from 'lucide-react';
import { Resizable } from 're-resizable';
import { Toaster, toast } from 'sonner';
import Button from './components/Button';
import Progress from './components/Progress';
import Input from './components/Input';
import Select from './components/Select';
interface FileItem {
id: string;
name: string;
status: 'pending' | 'processing' | 'completed' | 'error';
progress?: number;
transcription?: TranscriptionSegment[];
file?: File;
fullText?: string;
}
interface TranscriptionSegment {
start: string;
end: string;
text: string;
}
// Get API URL from environment variable
const API_URL = import.meta.env.VITE_API_URL || 'http://localhost:5000';
export default function App() {
const [fileQueue, setFileQueue] = useState<FileItem[]>([]);
const [selectedFileId, setSelectedFileId] = useState<string | null>(null);
const [isDark, setIsDark] = useState(false);
const [windowSize, setWindowSize] = useState({ width: 1100, height: 700 });
const [searchQuery, setSearchQuery] = useState('');
const [exportFormat, setExportFormat] = useState('txt');
const fileInputRef = useRef<HTMLInputElement>(null);
// Theme colors
const theme = {
bg: isDark ? '#1a1a1a' : '#f5f5f5',
cardBg: isDark ? '#2d2d2d' : '#ffffff',
inputBg: isDark ? '#3a3a3a' : '#f9f9f9',
border: isDark ? '#4a4a4a' : '#d0d0d0',
text: isDark ? '#e0e0e0' : '#333333',
textSecondary: isDark ? '#a0a0a0' : '#666666',
progressBg: isDark ? '#404040' : '#e0e0e0',
sidebarBg: isDark ? '#252525' : '#fafafa',
hoverBg: isDark ? '#3a3a3a' : '#f0f0f0',
selectedBg: isDark ? '#4a4a4a' : '#e8f5e9',
};
const handleAddFiles = () => {
fileInputRef.current?.click();
};
const handleFileInputChange = (e: React.ChangeEvent<HTMLInputElement>) => {
const files = e.currentTarget.files;
if (!files) return;
const newFiles: FileItem[] = [];
for (let i = 0; i < files.length; i++) {
const file = files[i];
const newFileItem: FileItem = {
id: `${Date.now()}-${i}`,
name: file.name,
status: 'pending',
file: file,
};
newFiles.push(newFileItem);
}
setFileQueue([...fileQueue, ...newFiles]);
if (!selectedFileId && newFiles.length > 0) {
setSelectedFileId(newFiles[0].id);
}
toast.success(`${newFiles.length} file(s) added to queue`);
// Reset input
if (fileInputRef.current) {
fileInputRef.current.value = '';
}
};
  const handleRemoveFile = (id: string) => {
    // Compute the remaining queue first so we never re-select the file being removed
    const remaining = fileQueue.filter(f => f.id !== id);
    setFileQueue(remaining);
    if (selectedFileId === id) {
      setSelectedFileId(remaining[0]?.id || null);
    }
    toast.info('File removed from queue');
  };
const handleTranscribe = async () => {
if (!selectedFileId) return;
const fileIndex = fileQueue.findIndex(f => f.id === selectedFileId);
if (fileIndex === -1 || !fileQueue[fileIndex].file) return;
// Update status to processing
const updatedQueue = [...fileQueue];
updatedQueue[fileIndex].status = 'processing';
updatedQueue[fileIndex].progress = 0;
setFileQueue(updatedQueue);
try {
const file = fileQueue[fileIndex].file!;
const formData = new FormData();
formData.append('file', file);
formData.append('language', 'fa'); // Farsi by default
// Show loading toast
const loadingToastId = toast.loading('Loading Whisper model (first time only)...');
const response = await fetch(`${API_URL}/transcribe`, {
method: 'POST',
body: formData,
});
if (!response.ok) {
throw new Error(`API error: ${response.statusText}`);
}
const result = await response.json();
// Dismiss loading toast
toast.dismiss(loadingToastId);
if (result.status === 'success') {
const updated = [...fileQueue];
updated[fileIndex].status = 'completed';
updated[fileIndex].progress = 100;
updated[fileIndex].transcription = result.segments;
updated[fileIndex].fullText = result.text;
setFileQueue(updated);
toast.success('Transcription completed!');
} else {
throw new Error(result.error || 'Unknown error');
}
} catch (error) {
const updated = [...fileQueue];
updated[fileIndex].status = 'error';
setFileQueue(updated);
const errorMsg = error instanceof Error ? error.message : 'Failed to transcribe file';
toast.error(errorMsg);
}
};
const handleCopySegment = (text: string) => {
navigator.clipboard.writeText(text);
toast.success('Copied to clipboard');
};
const handleExport = async () => {
const selectedFile = fileQueue.find(f => f.id === selectedFileId);
if (!selectedFile?.transcription || !selectedFile.fullText) {
toast.error('No transcription to export');
return;
}
try {
const toastId = toast.loading('Preparing export...');
const response = await fetch(`${API_URL}/export`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
transcription: selectedFile.fullText,
segments: selectedFile.transcription,
format: exportFormat,
}),
});
if (!response.ok) {
throw new Error(`Export failed: ${response.statusText}`);
}
const result = await response.json();
toast.dismiss(toastId);
if (result.status === 'success') {
// Create a blob and download
const blob = new Blob([result.content], { type: result.mime_type });
const url = window.URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `${selectedFile.name.split('.')[0]}.${exportFormat === 'json' ? 'json' : exportFormat}`;
document.body.appendChild(a);
a.click();
window.URL.revokeObjectURL(url);
document.body.removeChild(a);
toast.success(`Exported as ${exportFormat.toUpperCase()}`);
} else {
throw new Error(result.error || 'Export failed');
}
} catch (error) {
const errorMsg = error instanceof Error ? error.message : 'Failed to export';
toast.error(errorMsg);
}
};
const handleClearAll = () => {
setFileQueue([]);
setSelectedFileId(null);
setSearchQuery('');
toast.info('All files cleared');
};
const selectedFile = fileQueue.find(f => f.id === selectedFileId);
const currentTranscription = selectedFile?.transcription || [];
// Filter transcription based on search
const filteredTranscription = searchQuery
? currentTranscription.filter(seg =>
seg.text.toLowerCase().includes(searchQuery.toLowerCase())
)
: currentTranscription;
  // Highlight search matches. The result is rendered via dangerouslySetInnerHTML,
  // so escape HTML in the transcription text and regex metacharacters in the query.
  const escapeHtml = (s: string) =>
    s.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
  const escapeRegExp = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const highlightText = (text: string, query: string) => {
    if (!query) return escapeHtml(text);
    const parts = text.split(new RegExp(`(${escapeRegExp(query)})`, 'gi'));
    return parts.map((part) =>
      part.toLowerCase() === query.toLowerCase()
        ? `<mark style="background-color: ${isDark ? '#4CAF50' : '#FFEB3B'}; color: #000; padding: 2px 4px; border-radius: 2px;">${escapeHtml(part)}</mark>`
        : escapeHtml(part)
    ).join('');
  };
const getStatusIcon = (status: FileItem['status']) => {
switch (status) {
case 'completed':
return <CheckCircle2 className="w-4 h-4 text-green-500" />;
case 'processing':
return <Loader2 className="w-4 h-4 text-blue-500 animate-spin" />;
case 'error':
return <X className="w-4 h-4 text-red-500" />;
default:
return <Clock className="w-4 h-4" style={{ color: theme.textSecondary }} />;
}
};
return (
<div className="min-h-screen flex items-center justify-center p-8" style={{ backgroundColor: theme.bg }}>
<Toaster theme={isDark ? 'dark' : 'light'} position="top-right" />
{/* Hidden file input */}
<input
ref={fileInputRef}
type="file"
multiple
accept="audio/*,video/*"
onChange={handleFileInputChange}
style={{ display: 'none' }}
/>
<Resizable
size={windowSize}
onResizeStop={(_e, _direction, _ref, d) => {
setWindowSize({
width: windowSize.width + d.width,
height: windowSize.height + d.height,
});
}}
minWidth={900}
minHeight={600}
className="rounded-lg shadow-2xl overflow-hidden"
style={{
backgroundColor: theme.cardBg,
border: `2px solid ${theme.border}`,
}}
handleStyles={{
right: { cursor: 'ew-resize' },
bottom: { cursor: 'ns-resize' },
bottomRight: { cursor: 'nwse-resize' },
}}
>
<div className="flex h-full">
{/* Left Sidebar - File Queue */}
<div
className="w-64 border-r flex flex-col overflow-hidden"
style={{ borderColor: theme.border, backgroundColor: theme.sidebarBg }}
>
<div className="p-4 border-b" style={{ borderColor: theme.border }}>
<h3 className="mb-3 font-semibold" style={{ color: theme.text }}>
File Queue
</h3>
<Button
onClick={handleAddFiles}
className="w-full bg-green-500 hover:bg-green-600 text-white"
>
<Upload className="w-4 h-4 mr-2" />
Add Files
</Button>
</div>
<div className="flex-1 overflow-auto p-2">
{fileQueue.length === 0 ? (
<p className="text-center text-xs p-4" style={{ color: theme.textSecondary }}>
No files in queue
</p>
) : (
fileQueue.map((file) => (
<div
key={file.id}
className="mb-2 p-3 rounded-lg cursor-pointer transition-colors border"
style={{
backgroundColor: selectedFileId === file.id ? theme.selectedBg : theme.cardBg,
borderColor: selectedFileId === file.id ? '#4CAF50' : theme.border,
}}
onClick={() => setSelectedFileId(file.id)}
>
<div className="flex items-start justify-between gap-2 mb-2">
<div className="flex items-center gap-2 flex-1 min-w-0">
{getStatusIcon(file.status)}
<span className="text-xs truncate" style={{ color: theme.text }}>
{file.name}
</span>
</div>
<button
onClick={(e) => {
e.stopPropagation();
handleRemoveFile(file.id);
}}
className="hover:opacity-70"
>
<X className="w-3 h-3" style={{ color: theme.textSecondary }} />
</button>
</div>
{file.status === 'processing' && (
<div className="space-y-1">
<Progress value={file.progress || 0} />
<p className="text-xs" style={{ color: theme.textSecondary }}>
{file.progress}%
</p>
</div>
)}
</div>
))
)}
</div>
</div>
{/* Main Content Area */}
<div className="flex-1 flex flex-col overflow-hidden">
{/* Header */}
<div
className="p-5 border-b flex items-center justify-between"
style={{ borderColor: theme.border }}
>
<div className="flex items-center gap-3">
<h1 style={{ color: theme.text }} className="text-lg font-semibold">
Farsi Audio/Video Transcriber
</h1>
<span className="text-xs" style={{ color: theme.textSecondary }}>
{windowSize.width}×{windowSize.height}
</span>
</div>
<Button
onClick={() => setIsDark(!isDark)}
variant="outline"
style={{ borderColor: theme.border, backgroundColor: theme.cardBg }}
>
{isDark ? (
<Sun className="w-4 h-4" style={{ color: theme.text }} />
) : (
<Moon className="w-4 h-4" style={{ color: theme.text }} />
)}
</Button>
</div>
<div className="flex-1 flex flex-col p-5 overflow-hidden">
{/* File Info & Actions */}
<div
className="mb-4 p-4 rounded-lg border"
style={{ backgroundColor: theme.inputBg, borderColor: theme.border }}
>
<div className="flex items-center justify-between">
<div className="flex items-center gap-3">
<FileAudio className="w-5 h-5" style={{ color: theme.textSecondary }} />
<div>
<p className="text-sm" style={{ color: theme.text }}>
{selectedFile ? selectedFile.name : 'No file selected'}
</p>
{selectedFile?.status === 'processing' && (
<p className="text-xs" style={{ color: theme.textSecondary }}>
Processing... {selectedFile.progress}%
</p>
)}
{selectedFile?.status === 'completed' && (
<p className="text-xs text-green-500">Completed</p>
)}
</div>
</div>
<Button
onClick={handleTranscribe}
disabled={!selectedFile || selectedFile.status === 'processing' || selectedFile.status === 'completed'}
className="bg-green-500 hover:bg-green-600 text-white disabled:bg-gray-400 disabled:cursor-not-allowed"
>
{selectedFile?.status === 'processing' ? 'Transcribing...' : 'Transcribe'}
</Button>
</div>
</div>
{/* Search & Export Controls */}
{selectedFile?.transcription && (
<div className="mb-4 flex gap-2">
<div className="flex-1 relative">
<Search
className="w-4 h-4 absolute left-3 top-1/2 -translate-y-1/2"
style={{ color: theme.textSecondary }}
/>
<Input
placeholder="Search in transcription..."
value={searchQuery}
onChange={(e) => setSearchQuery(e.target.value)}
style={{
backgroundColor: theme.inputBg,
borderColor: theme.border,
color: theme.text,
paddingLeft: '2.25rem',
}}
/>
</div>
<Select
value={exportFormat}
onChange={(e) => setExportFormat(e.target.value as 'txt' | 'srt' | 'vtt' | 'json')}
>
<option value="txt">TXT</option>
<option value="srt">SRT</option>
<option value="vtt">VTT</option>
<option value="json">JSON</option>
</Select>
<Button
onClick={handleExport}
variant="outline"
style={{ borderColor: theme.border, backgroundColor: theme.cardBg, color: theme.text }}
>
<Download className="w-4 h-4 mr-2" />
Export
</Button>
</div>
)}
{/* Transcription Results */}
<div className="flex-1 flex flex-col min-h-0">
<div className="flex items-center justify-between mb-2">
<label style={{ color: theme.text }} className="text-sm font-medium">
Transcription Results:
</label>
{searchQuery && (
<span className="text-xs" style={{ color: theme.textSecondary }}>
{filteredTranscription.length} results found
</span>
)}
</div>
<div
className="flex-1 rounded-lg border p-4 overflow-auto"
style={{ backgroundColor: theme.cardBg, borderColor: theme.border }}
>
{currentTranscription.length === 0 ? (
<p className="text-center" style={{ color: theme.textSecondary }}>
Transcription results will appear here...
</p>
) : (
<div className="space-y-3">
{filteredTranscription.map((segment, index) => (
<div
key={index}
className="p-3 rounded-md border group hover:shadow-sm transition-shadow"
style={{
backgroundColor: theme.inputBg,
borderColor: theme.border,
}}
>
<div className="flex items-start justify-between gap-3 mb-2">
<span
className="text-xs font-mono"
style={{ color: theme.textSecondary }}
>
[{segment.start} - {segment.end}]
</span>
<button
onClick={() => handleCopySegment(segment.text)}
className="opacity-0 group-hover:opacity-100 transition-opacity"
title="Copy segment"
>
<Copy className="w-3 h-3" style={{ color: theme.textSecondary }} />
</button>
</div>
<p
className="text-sm leading-relaxed"
style={{ color: theme.text }}
dir="rtl"
dangerouslySetInnerHTML={{ __html: highlightText(segment.text, searchQuery) }}
/>
</div>
))}
</div>
)}
</div>
</div>
{/* Bottom Actions */}
<div className="flex justify-between items-center mt-4">
<p className="text-xs" style={{ color: theme.textSecondary }}>
{selectedFile?.status === 'completed' && `${currentTranscription.length} segments`}
</p>
<Button
onClick={handleClearAll}
variant="outline"
style={{ borderColor: theme.border, backgroundColor: theme.cardBg, color: theme.text }}
>
Clear All
</Button>
</div>
</div>
</div>
</div>
</Resizable>
</div>
);
}

View File

@ -0,0 +1,36 @@
import React from 'react';
interface ButtonProps extends React.ButtonHTMLAttributes<HTMLButtonElement> {
variant?: 'default' | 'outline';
size?: 'sm' | 'md' | 'lg';
children: React.ReactNode;
}
const Button = React.forwardRef<HTMLButtonElement, ButtonProps>(
({ variant = 'default', size = 'md', className, ...props }, ref) => {
const baseStyles = 'font-medium rounded transition-colors disabled:opacity-50 disabled:cursor-not-allowed inline-flex items-center justify-center';
const variantStyles = {
default: 'bg-green-500 hover:bg-green-600 text-white',
outline: 'border border-gray-300 hover:bg-gray-100 text-gray-900',
};
const sizeStyles = {
sm: 'px-3 py-1.5 text-sm',
md: 'px-4 py-2 text-base',
lg: 'px-6 py-3 text-lg',
};
return (
<button
ref={ref}
className={`${baseStyles} ${variantStyles[variant]} ${sizeStyles[size]} ${className || ''}`}
{...props}
/>
);
}
);
Button.displayName = 'Button';
export default Button;

View File

@ -0,0 +1,24 @@
import React from 'react';
interface InputProps extends React.InputHTMLAttributes<HTMLInputElement> {
label?: string;
}
const Input = React.forwardRef<HTMLInputElement, InputProps>(
({ label, className, ...props }, ref) => {
return (
<div className="w-full">
{label && <label className="block text-sm font-medium mb-1">{label}</label>}
<input
ref={ref}
className={`w-full px-3 py-2 border rounded-md text-sm focus:outline-none focus:ring-2 focus:ring-green-500 ${className || ''}`}
{...props}
/>
</div>
);
}
);
Input.displayName = 'Input';
export default Input;

View File

@ -0,0 +1,15 @@
interface ProgressProps {
value: number;
className?: string;
}
export default function Progress({ value, className }: ProgressProps) {
return (
<div className={`w-full bg-gray-200 rounded-full h-1.5 overflow-hidden ${className || ''}`}>
<div
className="bg-green-500 h-full transition-all duration-300"
style={{ width: `${Math.min(100, Math.max(0, value))}%` }}
/>
</div>
);
}

View File

@ -0,0 +1,27 @@
import React from 'react';
interface SelectProps extends React.SelectHTMLAttributes<HTMLSelectElement> {
label?: string;
children: React.ReactNode;
}
const Select = React.forwardRef<HTMLSelectElement, SelectProps>(
({ label, className, children, ...props }, ref) => {
return (
<div className="w-full">
{label && <label className="block text-sm font-medium mb-1">{label}</label>}
<select
ref={ref}
className={`w-full px-3 py-2 border rounded-md text-sm focus:outline-none focus:ring-2 focus:ring-green-500 bg-white ${className || ''}`}
{...props}
>
{children}
</select>
</div>
);
}
);
Select.displayName = 'Select';
export default Select;

View File

@ -0,0 +1,4 @@
export { default as Button } from './Button';
export { default as Progress } from './Progress';
export { default as Input } from './Input';
export { default as Select } from './Select';

View File

@ -0,0 +1,46 @@
@import "tailwindcss";
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
html, body, #root {
width: 100%;
height: 100%;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
sans-serif;
-webkit-font-smoothing: antialiased;
-moz-osx-font-smoothing: grayscale;
}
mark {
display: inline-block;
}
/* RTL Support */
[dir="rtl"] {
text-align: right;
direction: rtl;
}
/* Scrollbar styling */
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
::-webkit-scrollbar-track {
background: transparent;
}
::-webkit-scrollbar-thumb {
background: #4CAF50;
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: #45a049;
}

View File

@ -0,0 +1,10 @@
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App.tsx'
import './index.css'
ReactDOM.createRoot(document.getElementById('root')!).render(
<React.StrictMode>
<App />
</React.StrictMode>,
)

View File

@ -0,0 +1,17 @@
/** @type {import('tailwindcss').Config} */
export default {
content: [
"./index.html",
"./src/**/*.{js,ts,jsx,tsx}",
],
theme: {
extend: {
colors: {
primary: '#4CAF50',
'primary-hover': '#45a049',
'primary-active': '#3d8b40',
}
},
},
plugins: [],
}

View File

@ -0,0 +1,27 @@
{
"compilerOptions": {
"target": "ES2020",
"useDefineForClassFields": true,
"lib": ["ES2020", "DOM", "DOM.Iterable"],
"module": "ESNext",
"skipLibCheck": true,
"esModuleInterop": true,
"allowSyntheticDefaultImports": true,
/* Bundler mode */
"moduleResolution": "bundler",
"allowImportingTsExtensions": true,
"resolveJsonModule": true,
"isolatedModules": true,
"noEmit": true,
"jsx": "react-jsx",
/* Linting */
"strict": true,
"noUnusedLocals": true,
"noUnusedParameters": true,
"noFallthroughCasesInSwitch": true
},
"include": ["src"],
"references": [{ "path": "./tsconfig.node.json" }]
}

View File

@ -0,0 +1,10 @@
{
"compilerOptions": {
"composite": true,
"skipLibCheck": true,
"module": "ESNext",
"moduleResolution": "bundler",
"allowSyntheticDefaultImports": true
},
"include": ["vite.config.ts"]
}

View File

@ -0,0 +1,38 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
// https://vitejs.dev/config/
export default defineConfig({
plugins: [react()],
server: {
port: 3000,
proxy: {
      // In dev, requests to /api/* are forwarded to the Flask backend with the
      // /api prefix stripped (e.g. /api/transcribe -> http://localhost:5000/transcribe)
      '/api': {
target: 'http://localhost:5000',
changeOrigin: true,
rewrite: (path) => path.replace(/^\/api/, '')
}
}
},
build: {
outDir: 'dist',
sourcemap: false,
minify: 'terser',
terserOptions: {
compress: {
drop_console: true,
},
},
rollupOptions: {
output: {
manualChunks: {
vendor: ['react', 'react-dom'],
},
},
},
},
preview: {
port: parseInt(process.env.PORT || '3000'),
host: '0.0.0.0',
}
})

View File

@ -281,7 +281,7 @@ def transcribe(
time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
window_end_time = float((seek + N_FRAMES) * HOP_LENGTH / SAMPLE_RATE)
segment_size = min(N_FRAMES, content_frames - seek, seek_clip_end - seek)
mel_segment = mel[:, seek : seek + segment_size]
mel_segment = mel[:, seek:seek + segment_size]
segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
@ -444,7 +444,7 @@ def transcribe(
continue
if is_segment_anomaly(segment):
next_segment = next_words_segment(
current_segments[si + 1 :]
current_segments[si + 1:]
)
if next_segment is not None:
hal_next_start = next_segment["words"][0]["start"]
@ -508,7 +508,7 @@ def transcribe(
pbar.update(min(content_frames, seek) - previous_seek)
return dict(
text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
text=tokenizer.decode(all_tokens[len(initial_prompt_tokens):]),
segments=all_segments,
language=language,
)

View File

@ -153,7 +153,7 @@ class SubtitlesWriter(ResultWriter):
if max_words_per_line > len(segment["words"]) - chunk_index:
words_count = remaining_words
for i, original_timing in enumerate(
segment["words"][chunk_index : chunk_index + words_count]
segment["words"][chunk_index:chunk_index + words_count]
):
timing = original_timing.copy()
long_pause = (