diff --git a/farsi_transcriber_web/.gitignore b/farsi_transcriber_web/.gitignore
new file mode 100644
index 0000000..9d231bb
--- /dev/null
+++ b/farsi_transcriber_web/.gitignore
@@ -0,0 +1,34 @@
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+pnpm-debug.log*
+lerna-debug.log*
+
+node_modules
+dist
+dist-ssr
+*.local
+
+# Environment variables
+.env
+.env.local
+.env.*.local
+
+# Editor directories and files
+.vscode
+.idea
+*.suo
+*.ntvs*
+*.njsproj
+*.sln
+*.sw?
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Build output
+*.tgz
diff --git a/farsi_transcriber_web/README.md b/farsi_transcriber_web/README.md
new file mode 100644
index 0000000..1737d96
--- /dev/null
+++ b/farsi_transcriber_web/README.md
@@ -0,0 +1,384 @@
+# Farsi Transcriber - Web Application
+
+A professional web-based application for transcribing Farsi audio and video files using OpenAI's Whisper model.
+
+## Features
+
+✨ **Core Features**
+- 🎙️ Transcribe audio files (MP3, WAV, M4A, FLAC, OGG, AAC, WMA)
+- 🎬 Extract audio from video files (MP4, MKV, MOV, WebM, AVI, FLV, WMV)
+- 🇮🇷 High-accuracy Farsi/Persian language transcription
+- ⏱️ Word-level timestamps for precise timing
+- 📤 Export to multiple formats (TXT, SRT, VTT, JSON)
+- 💻 Clean, intuitive React-based UI built from a Figma design
+- 🎨 Dark/light theme toggle
+- 🔍 Search and text highlighting in transcriptions
+- 📋 File queue management
+- 💾 Copy individual transcription segments
+- 🚀 GPU acceleration support (CUDA)
+- 🎯 Resizable window for a flexible workspace
+
+## Tech Stack
+
+**Frontend:**
+- React 18+ with TypeScript
+- Vite (fast build tool)
+- Tailwind CSS v4.0
+- Lucide React (icons)
+- re-resizable (window resizing)
+- Sonner (toast notifications)
+
+**Backend:**
+- Flask (Python web framework)
+- OpenAI Whisper (speech recognition)
+- PyTorch (deep learning)
+- Flask-CORS (cross-origin requests)
+
+## System Requirements
+
+**Frontend:**
+- Node.js 16+
+- npm/yarn/pnpm
+
+**Backend:**
+- Python 3.8+
+- 4GB RAM minimum
+- 8GB+ recommended
+- ffmpeg installed
+- Optional: NVIDIA GPU with CUDA support
+
+## Installation
+
+### Step 1: Install ffmpeg
+
+Choose your operating system:
+
+**Ubuntu/Debian:**
+```bash
+sudo apt update && sudo apt install ffmpeg
+```
+
+**macOS (Homebrew):**
+```bash
+brew install ffmpeg
+```
+
+**Windows (Chocolatey):**
+```bash
+choco install ffmpeg
+```
+
+### Step 2: Backend Setup
+
+```bash
+# Navigate to backend directory
+cd backend
+
+# Create virtual environment
+python3 -m venv venv
+source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### Step 3: Frontend Setup
+
+```bash
+# Navigate to root directory
+cd ..
+
+# Install Node dependencies
+npm install
+
+# Or use yarn/pnpm
+yarn install
+# or
+pnpm install
+```
+
+## Running the Application
+
+### Step 1: Start Backend API
+
+```bash
+cd backend
+source venv/bin/activate  # Activate virtual environment
+python app.py
+```
+
+The API will be available at `http://localhost:5000`.
+
+### Step 2: Start Frontend Dev Server
+
+In a new terminal:
+
+```bash
+npm run dev
+```
+
+The application will be available at `http://localhost:3000`.
+
+## Building for Production
+
+### Frontend Build
+
+```bash
+npm run build
+```
+
+This creates an optimized production build in the `dist/` directory.
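+
+To sanity-check the bundle locally before deploying, you can serve `dist/` with Vite's preview server (this assumes the default `preview` script that Vite scaffolding adds to `package.json`):
+
+```bash
+# Serves dist/ on http://localhost:4173 by default
+npm run preview
+```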
+
+### Backend Deployment
+
+For production, serve the API with a WSGI server instead of Flask's built-in development server:
+
+```bash
+# Install Gunicorn
+pip install gunicorn
+
+# Run with Gunicorn
+gunicorn -w 4 -b 0.0.0.0:5000 app:app
+```
+
+## API Endpoints
+
+### `/health` (GET)
+Health check endpoint.
+
+**Response:**
+```json
+{
+  "status": "healthy",
+  "model_loaded": true,
+  "device": "cuda|cpu"
+}
+```
+
+### `/transcribe` (POST)
+Transcribe an audio/video file.
+
+**Request:**
+- `file`: Audio/video file (multipart/form-data)
+- `language`: Language code (optional, default: "fa" for Farsi)
+
+**Response:**
+```json
+{
+  "status": "success",
+  "filename": "audio.mp3",
+  "language": "fa",
+  "text": "Full transcription text...",
+  "segments": [
+    {
+      "start": "00:00:00.000",
+      "end": "00:00:05.500",
+      "text": "سلام دنیا"
+    }
+  ]
+}
+```
+
+### `/models` (GET)
+Get available Whisper models.
+
+**Response:**
+```json
+{
+  "available_models": ["tiny", "base", "small", "medium", "large"],
+  "current_model": "medium",
+  "description": "..."
+}
+```
+
+### `/export` (POST)
+Export a transcription.
+
+**Request:**
+```json
+{
+  "transcription": "Full text...",
+  "segments": [...],
+  "format": "txt|srt|vtt|json"
+}
+```
+
+**Response:**
+```json
+{
+  "status": "success",
+  "format": "srt",
+  "content": "...",
+  "mime_type": "text/plain"
+}
+```
+
+## Usage Guide
+
+### 1. Add Files to Queue
+- Click the "Add Files" button in the left sidebar
+- Select audio or video files
+- Multiple files can be added to the queue
+
+### 2. Transcribe
+- Select a file from the queue
+- Click the "Transcribe" button
+- Watch the progress indicator
+- Results appear with timestamps
+
+### 3. Search & Copy
+- Use the search bar to find specific text
+- Matching text is highlighted
+- Click the copy icon to copy individual segments
+
+### 4. Export Results
+- Select an export format (TXT, SRT, VTT, JSON)
+- Click the "Export" button
+- The file is downloaded or ready to save
+
+### 5. Theme Toggle
+- Click the sun/moon icon in the header
+- Switch between light and dark themes
+
+## Project Structure
+
+```
+farsi_transcriber_web/
+├── src/
+│   ├── App.tsx              # Main application component
+│   ├── main.tsx             # React entry point
+│   ├── index.css            # Global styles
+│   └── components/
+│       ├── Button.tsx
+│       ├── Progress.tsx
+│       ├── Input.tsx
+│       └── Select.tsx
+├── backend/
+│   ├── app.py               # Flask API server
+│   ├── requirements.txt     # Python dependencies
+│   └── .gitignore
+├── public/
+├── package.json
+├── vite.config.ts
+├── tsconfig.json
+├── tailwind.config.js
+├── postcss.config.js
+└── README.md
+```
+
+## Configuration
+
+### Environment Variables
+
+Create a `.env.local` file in the root directory:
+
+```
+VITE_API_URL=http://localhost:5000
+VITE_MAX_FILE_SIZE=500MB
+```
+
+### Backend Configuration
+
+Edit `backend/app.py` to customize:
+
+```python
+# Change model size
+model = whisper.load_model('large')  # tiny, base, small, medium, large
+
+# Change upload folder
+UPLOAD_FOLDER = '/custom/path'
+
+# Change max file size
+MAX_FILE_SIZE = 1024 * 1024 * 1024  # 1GB
+```
+
+## Troubleshooting
+
+### Issue: "API connection failed"
+**Solution**: Ensure the backend is running on `http://localhost:5000`.
+
+### Issue: "Whisper model not found"
+**Solution**: The first run downloads the model (~1.5GB for the default `medium` model, ~3GB for `large`). Ensure you have an internet connection and enough disk space.
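+
+To avoid the download stalling your first transcription request, you can pre-fetch the model before starting the server (adjust the name if you changed the `medium` default):
+
+```bash
+python -c "import whisper; whisper.load_model('medium')"
+```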
+
+### Issue: "CUDA out of memory"
+**Solution**: Switch to a smaller model (e.g., `small`) in `backend/app.py`, or run on CPU if GPU memory is limited.
+
+### Issue: "ffmpeg not found"
+**Solution**: Install ffmpeg using your package manager (see the Installation section).
+
+### Issue: Port 3000 or 5000 already in use
+**Solution**: Change the ports in `vite.config.ts` and `backend/app.py`.
+
+## Performance Tips
+
+1. **Use a GPU** - Ensure NVIDIA CUDA is properly installed
+2. **Choose an appropriate model** - Balance speed vs. accuracy
+3. **Close other applications** - Free up RAM/VRAM
+4. **Use an SSD** - Faster model loading and file I/O
+5. **Batch processing** - Process multiple files sequentially
+
+## Future Enhancements
+
+- [ ] Drag-and-drop file upload
+- [ ] Audio playback synchronized with transcription
+- [ ] Edit segments inline
+- [ ] Keyboard shortcuts
+- [ ] Save/load sessions
+- [ ] Speaker diarization
+- [ ] Confidence scores
+- [ ] Custom vocabulary support
+
+## Development
+
+### Code Style
+
+```bash
+# Lint code (if ESLint is configured)
+npm run lint
+
+# Start the development server
+npm run dev
+
+# Build for production
+npm run build
+```
+
+### Adding Components
+
+New components go in `src/components/` and should:
+- Use TypeScript
+- Include prop interfaces
+- Export as default
+- Include JSDoc comments
+
+## Common Issues & Solutions
+
+| Issue | Solution |
+|-------|----------|
+| Transcription is slow | Use a GPU, or switch to a smaller model |
+| File not supported | Check that the file extension is in the supported list |
+| Transcription has errors | Try a larger model (medium/large) |
+| Application crashes | Check the browser console and Flask logs |
+| Export not working | Ensure the segments data is complete |
+
+## License
+
+MIT License - free use, modification, and distribution permitted.
+
+## Credits
+
+Built with:
+- [OpenAI Whisper](https://github.com/openai/whisper) - Speech recognition
+- [React](https://react.dev/) - UI framework
+- [Vite](https://vitejs.dev/) - Build tool
+- [Tailwind CSS](https://tailwindcss.com/) - Styling
+- [Flask](https://flask.palletsprojects.com/) - Backend framework
+
+## Support
+
+For issues:
+1. Check the troubleshooting section
+2. Verify ffmpeg is installed
+3. Check the Flask backend logs
+4. Review the browser console for errors
+5. Ensure Python 3.8+ and Node.js 16+ are installed
diff --git a/farsi_transcriber_web/backend/.gitignore b/farsi_transcriber_web/backend/.gitignore
new file mode 100644
index 0000000..801d628
--- /dev/null
+++ b/farsi_transcriber_web/backend/.gitignore
@@ -0,0 +1,42 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+env/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# ML Models
+*.pt
+*.pth
+# (Whisper caches downloaded models in ~/.cache/whisper, outside the repo)
+
+# Uploads
+/uploads
+/tmp
diff --git a/farsi_transcriber_web/backend/app.py b/farsi_transcriber_web/backend/app.py
new file mode 100644
index 0000000..e92d820
--- /dev/null
+++ b/farsi_transcriber_web/backend/app.py
@@ -0,0 +1,199 @@
+"""
+Farsi Transcriber Backend API
+
+Flask API for handling audio/video file transcription using the Whisper model.
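+
+Endpoints:
+    GET  /health      - health check and model status
+    POST /transcribe  - transcribe an uploaded audio/video file
+    GET  /models      - list available Whisper model sizes
+    POST /export      - render a transcription as txt, srt, vtt, or json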
+"""
+
+import json
+import os
+
+import torch
+import whisper
+from flask import Flask, request, jsonify
+from flask_cors import CORS
+from werkzeug.utils import secure_filename
+
+app = Flask(__name__)
+CORS(app)
+
+# Configuration
+UPLOAD_FOLDER = '/tmp/farsi_transcriber_uploads'
+ALLOWED_EXTENSIONS = {'mp3', 'wav', 'm4a', 'flac', 'ogg', 'aac', 'wma', 'mp4', 'mkv', 'mov', 'webm', 'avi', 'flv', 'wmv'}
+MAX_FILE_SIZE = 500 * 1024 * 1024  # 500MB
+
+os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
+app.config['MAX_CONTENT_LENGTH'] = MAX_FILE_SIZE
+
+# Load Whisper model (downloaded to ~/.cache/whisper on first run)
+try:
+    model = whisper.load_model('medium')
+    print("✓ Whisper model loaded successfully")
+except Exception as e:
+    print(f"✗ Error loading Whisper model: {e}")
+    model = None
+
+
+def allowed_file(filename):
+    """Check if the file has an allowed extension"""
+    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+
+def _format_timestamp(seconds):
+    """Format a duration in seconds as HH:MM:SS.mmm"""
+    hours = int(seconds // 3600)
+    minutes = int((seconds % 3600) // 60)
+    secs = int(seconds % 60)
+    millis = int((seconds % 1) * 1000)
+    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
+
+
+@app.route('/health', methods=['GET'])
+def health():
+    """Health check endpoint"""
+    return jsonify({
+        'status': 'healthy',
+        'model_loaded': model is not None,
+        'device': 'cuda' if torch.cuda.is_available() else 'cpu'
+    })
+
+
+@app.route('/transcribe', methods=['POST'])
+def transcribe():
+    """
+    Transcribe an audio/video file
+
+    Request:
+    - file: Audio/video file
+    - language: Language code (default: 'fa' for Farsi)
+
+    Response:
+    - transcription results with segments and timestamps
+    """
+    try:
+        # Check if model is loaded
+        if not model:
+            return jsonify({'error': 'Whisper model not loaded'}), 500
+
+        # Check if file is in request
+        if 'file' not in request.files:
+            return jsonify({'error': 'No file provided'}), 400
+
+        file = request.files['file']
+
+        if file.filename == '':
+            return jsonify({'error': 'No file selected'}), 400
+
+        if not allowed_file(file.filename):
+            return jsonify({'error': 'File type not allowed'}), 400
+
+        # Save file
+        filename = secure_filename(file.filename)
+        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+        file.save(filepath)
+
+        # Get language code from request (default: Farsi)
+        language = request.form.get('language', 'fa')
+
+        # Transcribe (Whisper uses ffmpeg to decode audio, including video containers)
+        result = model.transcribe(filepath, language=language, verbose=False)
+
+        # Format response
+        segments = []
+        for segment in result.get('segments', []):
+            segments.append({
+                'start': _format_timestamp(segment['start']),
+                'end': _format_timestamp(segment['end']),
+                'text': segment['text'].strip(),
+            })
+
+        # Clean up the uploaded file
+        try:
+            os.remove(filepath)
+        except OSError:
+            pass
+
+        return jsonify({
+            'status': 'success',
+            'filename': filename,
+            'language': result.get('language', 'unknown'),
+            'text': result.get('text', ''),
+            'segments': segments
+        })
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+@app.route('/models', methods=['GET'])
+def get_models():
+    """Get available Whisper models"""
+    return jsonify({
+        'available_models': ['tiny', 'base', 'small', 'medium', 'large'],
+        'current_model': 'medium',
+        'description': 'List of available Whisper models. Larger models are more accurate but slower.'
+    })
+
+
+@app.route('/export', methods=['POST'])
+def export():
+    """
+    Export a transcription in the specified format
+
+    Request:
+    - transcription: Full transcription text
+    - segments: Array of segments with timestamps
+    - format: Export format (txt, srt, vtt, json)
+
+    Response:
+    - Exported file content
+    """
+    try:
+        data = request.get_json() or {}
+        transcription = data.get('transcription', '')
+        segments = data.get('segments', [])
+        format_type = data.get('format', 'txt').lower()
+
+        if format_type == 'txt':
+            content = transcription
+            mime_type = 'text/plain'
+        elif format_type == 'srt':
+            content = _format_srt(segments)
+            mime_type = 'text/plain'
+        elif format_type == 'vtt':
+            content = _format_vtt(segments)
+            mime_type = 'text/plain'
+        elif format_type == 'json':
+            content = json.dumps({'text': transcription, 'segments': segments}, ensure_ascii=False, indent=2)
+            mime_type = 'application/json'
+        else:
+            return jsonify({'error': 'Unsupported format'}), 400
+
+        return jsonify({
+            'status': 'success',
+            'format': format_type,
+            'content': content,
+            'mime_type': mime_type
+        })
+
+    except Exception as e:
+        return jsonify({'error': str(e)}), 500
+
+
+def _format_srt(segments):
+    """Format transcription as SRT subtitle format"""
+    lines = []
+    for i, segment in enumerate(segments, 1):
+        lines.append(str(i))
+        # SRT requires a comma as the millisecond separator (HH:MM:SS,mmm)
+        lines.append(f"{segment['start'].replace('.', ',')} --> {segment['end'].replace('.', ',')}")
+        lines.append(segment['text'])
+        lines.append('')
+    return '\n'.join(lines)
+
+
+def _format_vtt(segments):
+    """Format transcription as WebVTT subtitle format (WebVTT keeps the dot separator)"""
+    lines = ['WEBVTT', '']
+    for segment in segments:
+        lines.append(f"{segment['start']} --> {segment['end']}")
+        lines.append(segment['text'])
+        lines.append('')
+    return '\n'.join(lines)
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host='0.0.0.0', port=5000)
diff --git a/farsi_transcriber_web/backend/requirements.txt b/farsi_transcriber_web/backend/requirements.txt
new file mode 100644
index 0000000..3f1a28f
--- /dev/null
+++ b/farsi_transcriber_web/backend/requirements.txt
@@ -0,0 +1,6 @@
+Flask==2.3.3
+Flask-CORS==4.0.0
+python-dotenv==1.0.0
+openai-whisper==20230314
+torch>=1.10.1
+python-multipart==0.0.6
diff --git a/farsi_transcriber_web/index.html b/farsi_transcriber_web/index.html
new file mode 100644
index 0000000..8cf5754
--- /dev/null
+++ b/farsi_transcriber_web/index.html
@@ -0,0 +1,13 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="UTF-8" />
+    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>Farsi Transcriber</title>
+  </head>
+  <body>
+    <div id="root"></div>
+    <script type="module" src="/src/main.tsx"></script>
+  </body>
+</html>
[The remaining frontend hunks (farsi_transcriber_web/src/App.tsx and related files) were garbled during extraction and could not be reconstructed. The surviving JSX text fragments show: an empty-queue state ("No files in queue"), per-file progress in the queue list ("{file.progress}%"), a selected-file header falling back to "No file selected", "Processing... {selectedFile.progress}%" and "Completed" status indicators, a "Transcription results will appear here..." placeholder, and a segment count ("{currentTranscription.length} segments") shown for completed files.]